Loading packages
#===============================================================================
#BTC.LineZero.Header.1.1.0
#===============================================================================
#R Markdown environment setup and reporting utility.
#===============================================================================
#RLB.Dependencies:
# knitr, magrittr, pacman, rio, rmarkdown, rmdformats, tibble, yaml
#===============================================================================
#Input for document parameters, libraries, file paths, and options.
#=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=
# Chunk defaults: suppress package messages and warnings in the rendered report.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
# Project directory; used as the knitr root directory by FUN.LineZero.Boot().
# (The earlier, superseded assignments of these three settings were removed;
# only the values that were actually in effect are kept.)
path_working <- "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/5_Scripts/MGK/Host_depletion_git"
# R library location handed to .libPaths().
path_library <- "/Library/Frameworks/R.framework/Resources/library"
# Packages to load. Each package is listed once so that the environment report
# does not print duplicated entries.
str_libraries <- c(
  "readxl", "phyloseq", "tidyverse", "pacman", "yaml", "ggplot2", "vegan",
  "microbiome", "ggpubr", "viridis", "decontam", "gridExtra", "lme4",
  "lmerTest", "writexl", "harrietr", "Maaslin2", "ggtext", "ggpmisc", "gamm4",
  "reshape2", "kableExtra", "knitr", "ggtree", "car", "LDM", "mediation"
)
# YAML header text echoed by FUN.LineZero.Report().
# NOTE(review): the date below reads 2032 while the seed is 20230423 - confirm
# the intended year.
YAML_header <-
'---
title: "Host-DNA depletion 2: analysis - filtered"
author: "Minsik Kim"
date: "2032.04.23"
output:
rmdformats::downcute:
downcute_theme: "chaos"
code_folding: hide
fig_width: 6
fig_height: 6
---'
# Random seed, stored as text and passed to set.seed() in FUN.LineZero.Boot().
seed <- "20230423"
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Loads libraries, file paths, and other document options.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Sets the library path, loads the reporting and analysis packages, points
# knitr at the project directory and seeds the random number generator.
# Reads the globals path_library, path_working, str_libraries and seed.
# Returns NULL invisibly (the value of set.seed()).
FUN.LineZero.Boot <- function() {
  .libPaths(path_library)
  # Fail with a clear message instead of require()'s silent FALSE.
  if (!requireNamespace("pacman", quietly = TRUE)) {
    stop("Package 'pacman' must be installed to boot the environment.", call. = FALSE)
  }
  # Character vectors must be passed through `char =`; unnamed arguments to
  # p_load() are captured unevaluated as package names.
  pacman::p_load(char = c("knitr", "rmarkdown", "rmdformats", "yaml"))
  knitr::opts_knit$set(root.dir = path_working)
  # De-duplicate and sort locally; the global str_libraries is left untouched.
  pacman::p_load(char = sort(unique(str_libraries)))
  # The seed is stored as text, so convert it explicitly.
  set.seed(as.integer(seed))
}
FUN.LineZero.Boot()
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Outputs R environment report.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Prints the environment report: R version, the version of every package in
# str_libraries, operating system, library/working paths, seed and YAML header.
# Reads the globals str_libraries, path_library, path_working, seed, YAML_header.
FUN.LineZero.Report <- function() {
  cat("Line Zero Environment:\n\n")
  cat(paste("R:", pacman::p_version(), "\n"))
  cat("Libraries:\n")
  # One line per package, in the order given by str_libraries.
  for (pkg_name in str_libraries) {
    pkg_line <- paste0(
      " ", pkg_name, ": ", pacman::p_version(package = pkg_name),
      "\n"
    )
    cat(pkg_line)
  }
  cat(paste("\nOperating System:", pacman::p_detectOS(), "\n"))
  cat(paste(" Library Path:", path_library, "\n"))
  cat(paste(" Working Path:", path_working, "\n"))
  cat(paste("Seed:", seed, "\n\n"))
  cat("YAML Header:\n")
  cat(YAML_header)
}
FUN.LineZero.Report()
## Line Zero Environment:
##
## R: 4.2.2
## Libraries:
## readxl: 1.4.2
## phyloseq: 1.40.0
## tidyverse: 2.0.0
## pacman: 0.5.1
## yaml: 2.3.7
## ggplot2: 3.4.1
## vegan: 2.6.4
## microbiome: 1.18.0
## ggpubr: 0.6.0
## viridis: 0.6.2
## decontam: 1.16.0
## gridExtra: 2.3
## ggpubr: 0.6.0
## lme4: 1.1.31
## lmerTest: 3.1.3
## writexl: 1.4.2
## harrietr: 0.2.3
## Maaslin2: 1.10.0
## ggtext: 0.1.2
## ggpmisc: 0.5.2
## gridExtra: 2.3
## gamm4: 0.2.6
## reshape2: 1.4.4
## kableExtra: 1.3.4
## knitr: 1.42
## ggtree: 3.4.4
## car: 3.1.1
## LDM: 6.0
## mediation: 4.5.0
##
## Operating System: Darwin
## Library Path: /Library/Frameworks/R.framework/Resources/library
## Working Path: /Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/5_Scripts/MGK/Host_depletion_git
## Seed: 20230423
##
## YAML Header:
## ---
## title: "Host-DNA depletion 2: analysis - filtered"
## author: "Minsik Kim"
## date: "2032.04.23"
## output:
## rmdformats::downcute:
## downcute_theme: "chaos"
## code_folding: hide
## fig_width: 6
## fig_height: 6
## ---
Script description
1. Loading data
1.1. phyloseq object
1.2. qPCR data (controls)
2. QC
QC1. How many samples failed sequencing
QC2. How were changes in read stats and host DNA proportion?
QC3. How were the extraction controls
QC4. Prevalence / abundance filtering - red flag
3. Analysis
A0. Calculation of alpha-diversity indices
A1. Host DNA, bacterial DNA and % host
A2. Modeling of sequencing results
A3. Taxa alpha diversity
A4. Mediation analysis on species richness
A5. Taxa beta diversity
A6. DA for taxa
A7. LM of function alpha diversity
A8. permanova of function alpha diversity
A9. DA for function
A10. Rarefaction analysis
Data inputs
Meta data
qPCR - bacteria
qPCR - human
qPCR host %
Raw reads
final reads
sequencing host %
library prep failure status
Raw reads
subject_id
treatment
sample_type
subject_id
Sequencing result
samples
controls
Loading data
#Parallel computing option
# Loading files -----------------------------------------------------------
#loading tidy phyloseq object
# Read the tidy phyloseq bundle from its RDS file (machine-specific path).
phyloseq <- readr::read_rds(
  file = "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/4_Data/2_Tidy/Phyloseq/PHY_20230521_MGK_host_tidy.rds"
)
# Sample metadata of the count-based phyloseq object.
# This variable shares its name with the sample_data() function; function calls
# written as sample_data(...) still resolve to the function.
sample_data <- phyloseq::sample_data(phyloseq$phyloseq_count)
Q1. How were sequencing results?
Figure - regular scale
Raw scale is not normally distributed
# Initial QC --------------------------------------------------------------
#Questions - QC
#Q0. How many samples failed in sequencing
## figures -----raw data
# Raw-scale histograms of four read metrics, one panel per sample type and
# metric, stacked by treatment. Rows without a subject_id are dropped first.
sample_data %>%
  subset(., !is.na(.$subject_id)) %>%
  data.frame() %>%
  gather(feature, value, Raw_reads:sequencing_host_prop) %>%
  group_by(feature, sample_type) %>%
  # Keep only the four metrics that are plotted.
  dplyr::filter(
    feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop")
  ) %>%
  mutate(
    feature = factor(
      feature,
      levels = c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop"),
      labels = c("Raw reads", "Host mapped", "Final reads", "Host ratio")
    )
  ) %>%
  ggplot(aes(x = value, fill = treatment)) +
  geom_histogram(bins = 97) +
  facet_grid(sample_type ~ feature, scales = "free") +
  guides(fill = guide_legend(title = "Treatment", nrow = 1)) +
  ggtitle("Raw value histrogram") +
  theme_classic() +
  theme(legend.position = "top")
Figure - log10 scale
Log transform is adequate for read counts
Host % is not transformed well
## figures -----log10
# Same four read metrics as the raw-scale figure, plotted as log10(value).
sample_data %>%
  subset(., !is.na(.$subject_id)) %>%
  data.frame() %>%
  gather(feature, value, Raw_reads:sequencing_host_prop) %>%
  group_by(feature, sample_type) %>%
  # Keep only the four metrics that are plotted.
  dplyr::filter(
    feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop")
  ) %>%
  mutate(
    feature = factor(
      feature,
      levels = c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop"),
      labels = c("Raw reads", "Host mapped", "Final reads", "Host ratio")
    )
  ) %>%
  ggplot(aes(x = log10(value), fill = treatment)) +
  geom_histogram(bins = 97) +
  facet_grid(sample_type ~ feature, scales = "free") +
  guides(fill = guide_legend(title = "Treatment", nrow = 1)) +
  ggtitle("log10 transfromed histrogram") +
  theme_classic() +
  theme(legend.position = "top")
Figure - scaling host proportion
Raw % will be used for host%
## figures -----log10
# Compares three scalings of the host-read proportion (raw, log10, square
# root) as histograms, one panel per sample type and scaling.
sample_data %>%
  subset(., !is.na(.$subject_id)) %>%
  data.frame() %>%
  dplyr::mutate(
    host_seq_percent = sequencing_host_prop,
    log_seq_percent = log10(host_seq_percent),
    sqrt_seq_percent = sqrt(host_seq_percent),
    # Place the new columns right after the source column so that the column
    # range used by gather() below includes them.
    .after = sequencing_host_prop
  ) %>%
  gather(feature, value, Raw_reads:sqrt_seq_percent) %>%
  group_by(feature, sample_type) %>%
  dplyr::filter(
    feature %in% c("host_seq_percent", "log_seq_percent", "sqrt_seq_percent")
  ) %>%
  mutate(
    feature = factor(
      feature,
      levels = c("host_seq_percent", "log_seq_percent", "sqrt_seq_percent"),
      labels = c("Host ratio", "log10 (host ratio)", "Sqrt(host ratio)")
    )
  ) %>%
  ggplot(aes(x = value, fill = treatment)) +
  geom_histogram(bins = 97) +
  guides(fill = guide_legend(title = "Treatment", nrow = 1)) +
  facet_grid(sample_type ~ feature, scales = "free") +
  ggtitle("Host % transfromed (raw, log10, and sqrt) histrogram") +
  theme_classic() +
  theme(legend.position = "top")
Figure - log10 scale by treatment
# Two stacked panels sharing one legend: final reads on the raw scale (top)
# and on the log10 scale (bottom), faceted by sample type.
ggarrange(
  ggplot(
    sample_data %>% subset(., !is.na(.$subject_id)) %>% data.frame(),
    aes(x = Final_reads, fill = treatment)
  ) +
    geom_histogram(bins = 97) +
    facet_wrap(~sample_type) +
    ggtitle("Histogram of final reads by sample type and treatment") +
    theme_classic(base_family = "serif") +
    scale_fill_manual(
      values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
      name = "Treatment"
    ),
  ggplot(
    sample_data %>% subset(., !is.na(.$subject_id)) %>% data.frame(),
    aes(x = log10(Final_reads), fill = treatment)
  ) +
    geom_histogram(bins = 97) +
    facet_wrap(~sample_type) +
    ggtitle("Histogram of log10(final reads) by sample type and treatment") +
    theme_classic(base_family = "serif") +
    scale_fill_manual(
      values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
      name = "Treatment"
    ),
  ncol = 1,
  common.legend = TRUE
)
Histogram (sum of OTU table)
2 samples showed 0 reads in sum(OTU)
# Histogram of the column sums of the OTU table, shown as log10(sum + 1) with
# breaks = 100. The + 1 keeps columns whose sum is zero finite on the log scale.
# The default x-axis label is taken from the expression text, so the call is
# kept as a single nested expression.
hist((log10((phyloseq$phyloseq_count %>% otu_table %>% colSums()) + 1)),100, main = "Histogram of total reads (sum of OTU table)") # 2 samples showed 0 total reads (sum of otu_table)
Final reads of by sample type
Some samples did not pass library prep QC, but showed reasonable final reads
#how were the samples failed in library prep?
# Per-sample OTU-table column sums on a log10(x + 1) scale, ordered from the
# largest to the smallest total and coloured by sample type.
# Fix: the legend was titled "Library failed" although colour maps sample_type.
sample_data %>%
  data.frame() %>%
  mutate(total_read = colSums(otu_table(phyloseq$phyloseq_count))) %>%
  ggplot(aes(
    x = reorder(baylor_other_id, -total_read),
    y = log10(total_read + 1),
    col = sample_type
  )) +
  geom_point() +
  theme_classic(base_family = "serif") +
  # element_markdown() renders the <sub> tag in the y-axis title.
  theme(
    axis.title.y = element_markdown(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 4)
  ) +
  ylab("log<sub>10</sub>(Sum of OTU table reads)") +
  xlab("Sample ID") +
  guides(col = guide_legend(title = "Sample type")) +
  ggtitle("Sum of OTU reads by sample type")
Final reads of library prep failed samples
Some samples did not pass library prep QC, but showed reasonable final reads
#how were the samples failed in library prep?
# Per-sample OTU-table column sums on a log10(x + 1) scale, ordered from the
# largest to the smallest total and coloured by library-prep failure status.
sample_data %>%
  data.frame() %>%
  mutate(total_read = colSums(otu_table(phyloseq$phyloseq_count))) %>%
  ggplot(aes(
    x = reorder(baylor_other_id, -total_read),
    y = log10(total_read + 1),
    col = lib_failed
  )) +
  geom_point() +
  xlab("Sample ID") +
  ylab("log<sub>10</sub>(Sum of OTU table reads)") +
  ggtitle("Sum of OTU reads by library failure status") +
  guides(col = guide_legend(title = "Library failed")) +
  theme_classic(base_family = "serif") +
  # element_markdown() renders the <sub> tag in the y-axis title.
  theme(
    axis.title.y = element_markdown(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 4)
  )
Raw reads, Mapped reads, host reads, final reads, and sumOTU
Some samples did not pass library prep QC, but showed reasonable final reads
#how were the samples failed in library prep?
# Read counts of every sample at each data-processing step, on a
# log10(x + 1) scale, coloured by processing step.
# Fix: the legend was titled "Library failed" although colour maps the
# processing-step category.
sample_data %>%
  data.frame() %>%
  mutate(total_read = colSums(otu_table(phyloseq$phyloseq_count))) %>%
  # Long format: one row per sample and processing step.
  melt(
    id.vars = c("baylor_other_id"),
    measure.vars = c("Raw_reads", "LowQual_removed", "Reads_after_trim", "Host_mapped", "Final_reads", "Metaphlan_mapped", "total_read"),
    variable.name = "category",
    value.name = "reads"
  ) %>%
  mutate(
    category = factor(
      category,
      levels = c("Raw_reads", "LowQual_removed", "Reads_after_trim", "Host_mapped", "Final_reads", "Metaphlan_mapped", "total_read"),
      labels = c("Raw", "Low qual removed", "Trimmed reads", "Host", "Final", "Metaphlan", "OTU sum")
    )
  ) %>%
  ggplot(aes(
    x = reorder(baylor_other_id, -reads),
    y = log10(reads + 1),
    col = category
  )) +
  geom_point() +
  theme_classic(base_family = "serif") +
  # element_markdown() renders the <sub> tag in the y-axis title.
  theme(
    axis.title.y = element_markdown(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 4)
  ) +
  ylab("log<sub>10</sub>(reads + 1)") +
  xlab("Sample ID") +
  guides(col = guide_legend(title = "Processing step")) +
  ggtitle("Read counts by samples at each data processing step")
List of samples failed in sequencing
2 BAL samples (control and lyPMA group) failed in sequencing
# List the metadata rows whose OTU-table column sum is zero
# (two BAL samples showed 0 total reads).
sample_data %>%
  data.frame() %>%
  filter(colSums(otu_table(phyloseq$phyloseq_count)) == 0)
#sample_data(phyloseq$phyloseq_count) %>% data.frame() %>% subset(., .$lib_failed)
QC 1 Results:
1.1. Final reads should be modeled after log transformation. Host % needs no transformation.
1.2. 13 samples failed in library prep
1.3. Two BAL samples showed 0 total reads
1.4. Sequencing fail ≠ library prep fail
Comments from Baylor:
Q: What was the lab’s criteria for deciding which samples failed library prep.? There were 13 samples that you pointed as failed but their sequencing result actually looks pretty good (ie similar to samples that didn’t fail library prep)
A: To determine whether a library attempt “passed or failed” the lab looks at the picogreen concentrations and a library fragment size distribution trace. The trace files are an output from either the Fragment Analyzer or TapeStation (a copy of the trace files for PQ00331 is attached). If a sample has a background level pico concentration and no discernable fragment concentration on the trace (i.e. a flat trace line) it is considered failed library. If the sample is below the level of detection of our standard library QC methods, it is considered failure. It’s still possible that there is some small amounts of library in those samples that were successfully sequenced, but often those samples do not generate a meaningful amount of sequence data.
QC2 Changes of reads and host % by treatment
For detailed analysis, sequencing matrices were analyzed by each sample type and treatment
Reads and host % by treatment
QC table by treated (binary)
Changes in matrices were observed
#sequencing result by sample type and control (1/0)
# Formats a numeric vector as "median [Q1, Q3]" with two decimals and a
# thousands separator. Helper for the QC table below.
qc_median_iqr <- function(x) {
  two_dp <- function(v) format(round(v, 2), nsmall = 2, big.mark = ",")
  paste0(
    two_dp(median(x)), " [",
    two_dp(quantile(x, 0.25)), ", ",
    two_dp(quantile(x, 0.75)), "]"
  )
}
# Sequencing summary by sample type and treated status (1/0): read counts are
# reported in units of 10^7 reads, the host proportion in percent.
sample_data %>%
  data.frame() %>%
  dplyr::group_by(sample_type, treated) %>%
  dplyr::summarise(
    N = n(),
    `Raw reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = qc_median_iqr(Raw_reads / 10000000),
    `Host reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = qc_median_iqr(Host_mapped / 10000000),
    `Host reads proportion<br>(median [IQR])<br>[%]` = qc_median_iqr(sequencing_host_prop * 100),
    `Final reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = qc_median_iqr(Final_reads / 10000000)
  ) %>%
  dplyr::rename(`Sample type` = sample_type, Treated = treated) %>%
  data.frame(check.names = FALSE) %>%
  mutate_all(linebreak) %>%
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = 0, html_font = "serif")
| Sample type | Treated | N |
Raw reads (median [IQR]) [reads x 107] |
Host reads (median [IQR]) [reads x 107] |
Host reads proportion (median [IQR]) [%] |
Final reads (median [IQR]) [reads x 107] |
|---|---|---|---|---|---|---|
| Neg. | 0 | 6 | 0.20 [0.17, 0.22] | 0.02 [0.01, 0.02] | 16.85 [7.97, 20.09] | 0.08 [0.06, 0.11] |
| Neg. | 1 | 25 | 0.22 [0.17, 0.30] | 0.02 [0.02, 0.03] | 16.70 [14.76, 20.97] | 0.10 [0.08, 0.14] |
| Mock | 0 | 6 | 10.88 [10.36, 11.02] | 0.03 [0.02, 0.03] | 0.30 [0.28, 0.31] | 10.02 [9.60, 10.21] |
| Mock | 1 | 25 | 10.58 [8.16, 11.83] | 0.07 [0.06, 0.07] | 0.64 [0.63, 0.66] | 9.79 [7.29, 10.87] |
| BAL | 0 | 5 | 15.73 [6.35, 15.92] | 12.92 [5.21, 12.94] | 99.72 [99.59, 99.75] | 0.03 [0.03, 0.04] |
| BAL | 1 | 25 | 6.17 [4.57, 17.43] | 4.65 [2.78, 12.80] | 95.83 [87.19, 98.81] | 0.17 [0.10, 0.37] |
| Nasal | 0 | 10 | 13.09 [7.73, 16.93] | 10.05 [6.11, 13.04] | 94.05 [92.82, 97.87] | 0.48 [0.10, 0.87] |
| Nasal | 1 | 25 | 4.08 [0.99, 6.40] | 0.81 [0.26, 1.36] | 32.80 [15.74, 78.71] | 0.97 [0.17, 3.42] |
| Sputum | 0 | 5 | 8.59 [8.25, 9.27] | 6.87 [6.69, 7.50] | 99.19 [98.86, 99.21] | 0.06 [0.06, 0.09] |
| Sputum | 1 | 25 | 12.23 [10.34, 13.73] | 7.71 [3.76, 8.82] | 87.45 [47.33, 92.94] | 1.16 [0.47, 4.19] |
QC table by treatment methods
Changes were sample type * treatment specific
# Formats a numeric vector as "median<br>(Q1, Q3)" with a fixed number of
# decimals, so that all three numbers of a cell share the same precision.
table1_median_iqr <- function(x, digits = 2) {
  fixed_dp <- function(v) format(round(v, digits), nsmall = digits, big.mark = ",")
  paste0(
    fixed_dp(median(x)), "<br>(",
    fixed_dp(quantile(x, 0.25)), ", ",
    fixed_dp(quantile(x, 0.75)), ")"
  )
}
# Summary table for the clinical sample types (Sputum, Nasal, BAL) by
# treatment: total DNA, library/sequencing failures, clean reads, host read
# percentage and final reads. Read counts are in units of 10^7 reads.
# Fixes: the upper quartile of clean reads was rounded to one decimal while
# the median and lower quartile used two; the failure percentage is now
# rounded to one decimal.
table1 <- sample_data %>%
  data.frame() %>%
  dplyr::filter(sample_type %in% c("Sputum", "Nasal", "BAL")) %>%
  group_by(sample_type, treatment) %>%
  dplyr::summarise(
    N = n(),
    `Total DNA <br>ng/µL` = table1_median_iqr(DNA_host_ng_uL + DNA_bac_ng_uL),
    `Library or sequencing failure<br>N (%)` = paste0(
      sum(lib_failed), "<br>(", round(sum(lib_failed) / n() * 100, 1), " %)"
    ),
    `Clean reads<br>reads x 10<sup>7</sup>` = table1_median_iqr(Reads_after_trim / 10000000),
    `Host reads %` = table1_median_iqr(sequencing_host_prop * 100, digits = 1),
    `Final reads<br>reads x 10<sup>7</sup>` = table1_median_iqr(Final_reads / 10000000)
  ) %>%
  data.frame(check.names = FALSE) %>%
  arrange(sample_type, treatment) %>%
  dplyr::rename(`Sample type` = sample_type, Treatment = treatment) %>%
  mutate_all(linebreak) %>%
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = 0, html_font = "serif")
table1
| Sample type | Treatment | N |
Total DNA ng/µL |
Library or sequencing failure N (%) |
Clean reads reads x 107 |
Host reads % |
Final reads reads x 107 |
|---|---|---|---|---|---|---|---|
| BAL | Untreated | 5 |
1.53 (1.25, 9.87) |
0 (0 %) |
12.95 (5.25, 13.0) |
99.7 (99.6, 99.7) |
0.03 (0.03, 0.04) |
| BAL | lyPMA | 5 |
2.15 (0.06, 6.27) |
1 (20 %) |
4.67 (2.86, 11.0) |
99.1 (97.8, 99.5) |
0.06 (0.04, 0.10) |
| BAL | Benzonase | 5 |
0.06 (0.05, 0.08) |
0 (0 %) |
14.93 (12.97, 18.4) |
98.8 (98.7, 98.9) |
0.17 (0.16, 0.22) |
| BAL | Host zero | 5 |
0.01 (0.00, 0.01) |
1 (20 %) |
3.19 (1.84, 3.5) |
83.7 (76.8, 87.2) |
0.24 (0.13, 0.82) |
| BAL | Molysis | 5 |
0.02 (0.01, 0.03) |
1 (20 %) |
3.90 (2.90, 3.9) |
92.5 (92.5, 93.6) |
0.29 (0.13, 1.56) |
| BAL | QIAamp | 5 |
0.03 (0.03, 0.08) |
0 (0 %) |
13.24 (11.96, 13.8) |
98.3 (92.3, 98.6) |
0.26 (0.10, 1.02) |
| Nasal | Untreated | 10 |
0.37 (0.22, 0.71) |
0 (0 %) |
10.62 (6.37, 13.9) |
94.1 (92.8, 97.9) |
0.48 (0.10, 0.87) |
| Nasal | lyPMA | 5 |
0.00 (0.00, 0.01) |
4 (80 %) |
0.79 (0.69, 1.0) |
91.2 (35.6, 91.6) |
0.07 (0.06, 0.08) |
| Nasal | Benzonase | 5 |
0.02 (0.01, 0.08) |
0 (0 %) |
4.71 (4.17, 5.3) |
78.7 (77.8, 94.8) |
0.28 (0.26, 1.04) |
| Nasal | Host zero | 5 |
0.01 (0.00, 0.02) |
2 (40 %) |
2.45 (1.17, 5.5) |
8.9 (2.7, 30.4) |
2.43 (0.97, 5.03) |
| Nasal | Molysis | 5 |
0.00 (0.00, 0.01) |
4 (80 %) |
0.81 (0.50, 3.5) |
49.9 (5.0, 78.4) |
0.32 (0.17, 2.53) |
| Nasal | QIAamp | 5 |
0.03 (0.03, 0.04) |
0 (0 %) |
5.62 (5.49, 5.8) |
20.1 (15.7, 23.2) |
4.63 (4.50, 4.67) |
| Sputum | Untreated | 5 |
39.58 (19.62, 59.74) |
0 (0 %) |
6.92 (6.80, 7.6) |
99.2 (98.9, 99.2) |
0.06 (0.06, 0.09) |
| Sputum | lyPMA | 5 |
9.88 (1.02, 11.54) |
0 (0 %) |
8.97 (4.20, 10.5) |
96.4 (92.5, 98.3) |
0.25 (0.15, 0.44) |
| Sputum | Benzonase | 5 |
0.19 (0.16, 0.44) |
0 (0 %) |
8.40 (8.20, 8.7) |
94.2 (92.9, 94.5) |
0.47 (0.45, 0.59) |
| Sputum | Host zero | 5 |
0.09 (0.05, 0.09) |
0 (0 %) |
10.62 (6.16, 11.5) |
61.7 (37.5, 68.0) |
2.91 (2.36, 3.67) |
| Sputum | Molysis | 5 |
0.04 (0.04, 0.05) |
0 (0 %) |
10.56 (9.08, 11.6) |
32.8 (17.0, 33.8) |
6.11 (5.56, 8.37) |
| Sputum | QIAamp | 5 |
0.28 (0.23, 0.50) |
0 (0 %) |
10.24 (10.09, 10.6) |
88.2 (68.9, 88.6) |
1.16 (1.13, 3.89) |
Figure of reads by treatment (z-score)
Changes were sample type * treatment specific
# Summary figures - facet and z-score -------------------------------------
# Box plots of four read metrics by treatment. Values are z-scored within
# each (metric, sample type) group, so panels are comparable across metrics.
sample_data %>%
  subset(., !is.na(.$subject_id)) %>%
  data.frame() %>%
  gather(feature, value, Raw_reads:sequencing_host_prop) %>%
  group_by(feature, sample_type) %>%
  # Keep only the four metrics that are plotted.
  dplyr::filter(
    feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop")
  ) %>%
  mutate(
    z_score = scale(value),
    feature = factor(
      feature,
      levels = c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop"),
      labels = c("Raw reads", "Host mapped", "Final reads", "Host %")
    )
  ) %>%
  ggplot(aes(x = treatment, y = z_score, fill = treatment)) +
  geom_boxplot(lwd = 0.2) +
  facet_grid(sample_type ~ feature) +
  guides(fill = guide_legend(title = "Treatment", nrow = 1)) +
  xlab("Treatment") +
  ylab("Z score") +
  theme_classic(base_family = "serif", base_size = 14) +
  guides(x = guide_axis(angle = 90)) +
  theme(legend.position = "top") +
  # Colours from https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  scale_fill_manual(
    values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
    name = "Treatment"
  )
Results:
2.1. There were no differences in raw reads.
2.2. However, final reads increased after some treatment, and host DNA proportion decreased
QC3. Positive and negative controls
Positive and negative controls were compared with mock community
Reads and host % by treatment
Species richness of controls
Some possible contaminants were identified in extraction controls
#Loading theoretical mock community
# Theoretical composition of the mock community, read from a spreadsheet.
# The first column becomes the row names and the Mock column is divided by
# 100; the result is combined with the taxonomy table of the count data.
zymo_mock <- read_excel("/Users/minsikkim/Dropbox (Partners HealthCare)/@minsik/project_sicas2/data_raw/DAR_20210929_zymo_mock_data.xlsx") %>%
  data.frame(row.names = TRUE) %>%
  rename(mock_theoretical = Mock) %>%
  mutate(mock_theoretical = mock_theoretical / 100) %>%
  merge_phyloseq(otu_table(., taxa_are_rows = TRUE), tax_table(phyloseq$phyloseq_count))
# One-row sample table for the theoretical mock ("-" stands for no treatment),
# merged with its abundance table.
phyloseq_mock <- rbind(c("mock_theoretical", "Mock theoretical", "-")) %>%
  data.frame() %>%
  column_to_rownames(var = "X1") %>%
  rename(sample_type = X2, treatment = X3) %>%
  merge_phyloseq(sample_data(.), zymo_mock)
# Relative-abundance data of the sequenced controls (mock and negative).
phyloseq_control_rel <- subset_samples(phyloseq$phyloseq_rel, sample_type == "Mock" | sample_type == "Neg.")
# Convert to character before merging with the theoretical mock sample.
sample_data(phyloseq_control_rel)$treatment <- as.character(sample_data(phyloseq_control_rel)$treatment)
sample_data(phyloseq_control_rel)$sample_type <- as.character(sample_data(phyloseq_control_rel)$sample_type)
phyloseq_control_rel <- merge_phyloseq(phyloseq_control_rel, phyloseq_mock)
# Species richness of each control: number of taxa with non-zero abundance.
sample_data(phyloseq_control_rel)$S.obs <- rowSums(t(otu_table(phyloseq_control_rel)) != 0)
sample_data(phyloseq_control_rel)$sample_type <-
  factor(sample_data(phyloseq_control_rel)$sample_type, levels = c("Mock theoretical", "Mock", "Neg."))
# Fix: this previously assigned to a misspelled column ("teratment"), so the
# treatment column was never re-levelled. The untreated level is spelled
# "Untreated" to match the values in the data; a level named "Control" would
# turn those samples into NA.
sample_data(phyloseq_control_rel)$treatment <-
  factor(sample_data(phyloseq_control_rel)$treatment, levels = c("-", "Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"))
# Mean and standard deviation of species richness (S.obs) for every
# sample type x treatment combination of the controls, as an HTML table.
phyloseq_control_rel %>%
  sample_data() %>%
  # mutate(sample_type = factor(sample_type, levels = c("Mock", "Neg.")),
  #        treatment = factor(treatment, levels = c("Theoretical", "Control", "Benzonase", "Host zero", "Molysis", "QIAamp"))) %>%
  group_by(sample_type, treatment) %>%
  summarise(
    Mean = mean(S.obs),
    SD = sd(S.obs)
  ) %>%
  kbl(format = "html", caption = "Species richness of controls") %>%
  kable_styling(full_width = 0, html_font = "serif")
| sample_type | treatment | Mean | SD |
|---|---|---|---|
| Mock theoretical |
|
10.000000 | NA |
| Mock | Benzonase | 24.400000 | 1.1401754 |
| Mock | Host zero | 27.000000 | 7.8421936 |
| Mock | Molysis | 28.400000 | 1.1401754 |
| Mock | QIAamp | 25.800000 | 0.4472136 |
| Mock | Untreated | 41.333333 | 1.5055453 |
| Mock | lyPMA | 39.800000 | 15.9279628 |
| Neg. | Benzonase | 6.800000 | 2.3874673 |
| Neg. | Host zero | 9.400000 | 2.1908902 |
| Neg. | Molysis | 8.200000 | 3.3466401 |
| Neg. | QIAamp | 9.200000 | 3.4205263 |
| Neg. | Untreated | 8.333333 | 7.9162281 |
| Neg. | lyPMA | 11.000000 | 12.3490890 |
Bar plot of controls
Some possible contaminants were identified in extraction controls
Some changes are visible in the positive control.
#Manipulating phyloseq - only top 10
# Stacked bar plot of all controls. The ten taxa with the largest summed
# abundance keep their species name in a new "species20" column; every other
# taxon is labelled "[Others]".
tax_table(phyloseq_control_rel) %>%
  cbind(species20 = "[Others]") %>%
  {
    # Despite its name, this holds the ten most abundant taxa.
    top20species <- head(taxa_sums(phyloseq_control_rel) %>%
      data.frame %>%
      arrange(-.) %>%
      row.names(), 10)
    .[top20species, "species20"] <- as.character(.[top20species, "Species"])
    # Fix: address the label column by name. The positional index used here
    # (8) disagreed with the index used for the same step in later chunks (9),
    # so the italic markup could land on a different column.
    .[, "species20"] <- .[, "species20"] %>%
      gsub("s__", "", .) %>%
      gsub("_", " ", .) %>%
      paste("<i>", ., "</i>", sep = "")
    phyloseq_temp <- phyloseq_control_rel
    tax_table(phyloseq_temp) <- tax_table(.)
    phyloseq_temp
  } %>%
  plot_bar(., fill = "species20") +
  ylab("Relative abundancne") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Bar plot of control data") +
  # element_markdown() renders the <i> tags in the legend labels.
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Top 10 species")) +
  facet_wrap(~sample_type, scales = "free_x", nrow = 1)
#there could be opportunistic pathogens...
Bar plot of controls (Positive)
Some possible contaminants were identified in extraction controls
Gram-negatives in the positive control were fragile to the depletion methods
#Manipulating phyloseq - only top 10
# Stacked bar plot of the mock (positive control) samples, one panel per
# treatment. The ten most abundant taxa among non-empty mock samples keep
# their species name; all others are labelled "[Others]".
phyloseq_control_rel %>%
  subset_samples(., sample_type == "Mock") %>%
  tax_table() %>%
  cbind(species20 = "[Others]") %>%
  {
    # Despite its name, this holds the ten most abundant taxa.
    top20species <- head(taxa_sums(subset_samples(phyloseq_control_rel, sample_type == "Mock" & S.obs != 0)) %>%
      data.frame %>%
      arrange(-.) %>%
      row.names(), 10)
    .[top20species, "species20"] <- as.character(.[top20species, "Species"])
    # Fix: address the label column by name instead of by position (8), which
    # disagreed with the index used for the same step in later chunks (9).
    .[, "species20"] <- .[, "species20"] %>%
      gsub("s__", "", .) %>%
      gsub("_", " ", .) %>%
      paste("<i>", ., "</i>", sep = "")
    phyloseq_temp <- subset_samples(phyloseq_control_rel, sample_type == "Mock" & S.obs != 0)
    tax_table(phyloseq_temp) <- tax_table(.)
    phyloseq_temp
  } %>%
  plot_bar(., fill = "species20") +
  ylab("Relative abundancne") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Postive controls") +
  # element_markdown() renders the <i> tags in the legend labels.
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Top 10 species")) +
  # Fix: the untreated group is labelled "Untreated" in the data; the former
  # level "Control" sent those samples to an NA panel.
  facet_wrap(
    ~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
    scales = "free_x", nrow = 1
  )
#there could be opportunistic pathogens...
Issue 1 - gram negative
Benzonase and Host zero depleted all the gram-negative strains
Other methods decreased gram-negatives substantially, but not to zero
#Making function without borders
# Bar plot of a phyloseq object without bar borders.
#
# physeq:     object passed to psmelt() to obtain a long data frame.
# x, y, fill: column names (as strings) mapped to the x axis, the y axis and
#             the fill colour; fill may be NULL.
# title:      optional plot title.
# facet_grid: optional faceting specification passed to facet_grid().
# Returns the ggplot object.
my_plot_bar <- function(physeq, x = "Sample", y = "Abundance", fill = NULL, title = NULL,
                        facet_grid = NULL) {
  melted <- psmelt(physeq)
  bar_plot <- ggplot(melted, aes_string(x = x, y = y, fill = fill)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = -90, hjust = 0))
  # The argument shares its name with the ggplot2 function; the call below
  # still resolves to the function.
  if (!is.null(facet_grid)) {
    bar_plot <- bar_plot + facet_grid(facet_grid)
  }
  if (!is.null(title)) {
    bar_plot <- bar_plot + ggtitle(title)
  }
  bar_plot
}
# gram stain data
# Bar plot of the non-empty mock samples filled by the "gram_stain" taxonomy
# column, one panel per treatment, drawn with my_plot_bar().
phyloseq_control_rel %>%
subset_samples(., sample_type == "Mock") %>%
tax_table() %>%
cbind(species20 = "[Others]") %>%
# Inside the braces "." is the taxonomy matrix with the extra species20 column.
{top20species <- head(taxa_sums(subset_samples(phyloseq_control_rel,sample_type == "Mock" & S.obs != 0)) %>%
data.frame %>%
arrange(-.) %>%
row.names(), 10)
# Despite its name, top20species holds the ten most abundant taxa.
.[top20species, "species20"] <- as.character(.[top20species, "Species"])
# NOTE(review): column 8 is addressed by position; other chunks use position 9
# for the same markup step - confirm which column is meant here.
.[, 8] <- .[, 8] %>% gsub("s__", "", .) %>% gsub("_", " ", .) %>% paste("<i>", ., "</i>", sep = "")
phyloseq_temp <- subset_samples(phyloseq_control_rel,sample_type == "Mock" & S.obs != 0)
tax_table(phyloseq_temp) <- tax_table(.)
phyloseq_temp
} %>%
my_plot_bar(., fill="gram_stain") +
ylab("Relative abundancne") +
theme_classic(base_size = 11, base_family = "serif") +
ggtitle("Gram stain in Zymo mock") +
theme(legend.text = element_markdown()) +
guides(fill=guide_legend(title="Gram-stain")) +
facet_wrap (~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
scales= "free_x", nrow=1)
#there could be opportunistic pathogens...
#Manipulating phyloseq - only top 10
# Append, as extra sample-data columns, the abundances of the ten most
# abundant taxa among non-empty mock samples (one column per taxon).
# Re-running this statement appends the same columns again.
sample_data(phyloseq_control_rel) <- cbind(
  data.frame(sample_data(phyloseq_control_rel)),
  phyloseq_control_rel %>%
    otu_table() %>%
    data.frame() %>%
    subset(
      .,
      rownames(.) %in% head(
        taxa_sums(subset_samples(phyloseq_control_rel, sample_type == "Mock" & S.obs != 0)) %>%
          data.frame %>%
          arrange(-.) %>%
          row.names(),
        10
      )
    ) %>%
    # Transpose so that rows are samples and columns are taxa.
    t()
)
# Formats a numeric vector as "median [Q1, Q3]" with four fixed decimals.
# scientific = FALSE stops small values from being printed as e.g. "5e-04".
ctrl_median_iqr <- function(x) {
  four_dp <- function(v) {
    format(round(v, 4), nsmall = 4, big.mark = ",", scientific = FALSE)
  }
  paste0(
    four_dp(median(x)), " [",
    four_dp(quantile(x, 0.25)), ", ",
    four_dp(quantile(x, 0.75)), "]"
  )
}
# Relative abundance of three gram-negative mock species by sample type and
# treatment, for the samples that have an Escherichia_coli value.
sample_data(phyloseq_control_rel) %>%
  data.frame() %>%
  subset(., !is.na(.$Escherichia_coli)) %>%
  group_by(sample_type, treatment) %>%
  summarise(
    N = n(),
    `<i>Escherichia coli</i><br>(median [IQR])` = ctrl_median_iqr(Escherichia_coli),
    `<i>Pseudomonas aeruginosa</i><br>(median [IQR])` = ctrl_median_iqr(Pseudomonas_aeruginosa_group),
    `<i>Salmonella enterica</i><br>(median [IQR])` = ctrl_median_iqr(Salmonella_enterica)
  ) %>%
  rename(`Sample type` = sample_type) %>%
  data.frame(check.names = FALSE) %>%
  mutate_all(linebreak) %>%
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = 0, html_font = "serif")
| Sample type | treatment | N |
Escherichia coli (median [IQR]) |
Pseudomonas aeruginosa (median [IQR]) |
Salmonella enterica (median [IQR]) |
|---|---|---|---|---|---|
| Mock theoretical |
|
1 | 0.1200 [0.1200, 0.1200] | 0.1200 [0.1200, 0.1200] | 0.1200 [0.1200, 0.1200] |
| Mock | Benzonase | 5 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
| Mock | Host zero | 5 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
| Mock | Molysis | 5 | 0.0029 [0.0022, 0.0044] | 5e-04 [2e-04, 6e-04] | 0.0031 [0.0022, 0.0036] |
| Mock | QIAamp | 5 | 5e-04 [5e-04, 0.0010] | 1e-04 [1e-04, 1e-04] | 5e-04 [5e-04, 7e-04] |
| Mock | Untreated | 6 | 0.3118 [0.2964, 0.3231] | 0.0791 [0.0771, 0.0810] | 0.2405 [0.2314, 0.2467] |
| Mock | lyPMA | 5 | 0.1514 [0.1427, 0.2281] | 0.0477 [0.0443, 0.0576] | 0.1168 [0.1035, 0.1840] |
| Neg. | Benzonase | 5 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
| Neg. | Host zero | 5 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
| Neg. | Molysis | 5 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
| Neg. | QIAamp | 5 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
| Neg. | Untreated | 6 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
| Neg. | lyPMA | 5 | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] | 0.0000 [0.0000, 0.0000] |
Issue 2 - Positive controls contaminants
Some possible contaminants were identified in most of samples
This could be 1) background contamination or 2) cross-contamination from kingfisher. Most of these are gram positives. Negative controls should be double-checked
# Contaminants in the mock (positive control) samples.
# Step 1: drop the 10 most abundant taxa of the non-empty Mock samples
# (presumably the expected mock-community members -- TODO confirm), so that
# only unexpected taxa remain.
phyloseq_control_rel_contam <- subset_taxa(
  phyloseq_control_rel,
  !(taxa_names(phyloseq_control_rel) %in%
      head(
        taxa_sums(subset_samples(phyloseq_control_rel,
                                 sample_type == "Mock" & S.obs != 0)) %>%
          data.frame %>%
          arrange(-.) %>%
          row.names(),
        10
      ))
)
# Step 2: remove taxa that are now all-zero, then negative controls and
# empty samples.
phyloseq_control_rel_contam <- subset_taxa(
  phyloseq_control_rel_contam,
  taxa_sums(phyloseq_control_rel_contam) != 0
)
phyloseq_control_rel_contam <- subset_samples(
  phyloseq_control_rel_contam,
  sample_type != "Neg." & S.obs != 0
)
# Step 3: add a `species20` tax column that keeps the species name for the
# 10 most abundant remaining taxa and "[Others]" for the rest, then plot.
tax_table(phyloseq_control_rel_contam) %>%
  cbind(species20 = "[Others]") %>%
  {
    top10_species <- head(taxa_sums(phyloseq_control_rel_contam) %>%
                            data.frame %>%
                            arrange(-.) %>%
                            row.names(), 10)
    .[top10_species, "species20"] <- as.character(.[top10_species, "Species"])
    # Column 9 is addressed by position (presumably `species20` -- TODO
    # confirm); strip the "s__" prefix and wrap in <i> for element_markdown().
    .[, 9] <- .[, 9] %>%
      gsub("s__", "", .) %>%
      gsub("_", " ", .) %>%
      paste("<i>", ., "</i>", sep = "")
    phyloseq_temp <- phyloseq_control_rel_contam
    tax_table(phyloseq_temp) <- tax_table(.)
    phyloseq_temp
  } %>%
  my_plot_bar(., fill = "species20") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Contaminants in Zymo mock") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Top 10 species")) +
  # The untreated arm is labelled "Untreated" in the data; the former
  # "Control" level matched nothing and sent those samples to an NA facet.
  facet_wrap(~ factor(treatment,
                      levels = c("Untreated", "lyPMA", "Benzonase",
                                 "Host zero", "Molysis", "QIAamp")),
             scales = "free_x", nrow = 1)
Negative controls
Contaminants of the positive and negative controls do not match.
It seems that, without host DNA, gram-negative bacteria are vulnerable to the depletion methods.
The contaminants in the negative controls were most likely introduced after depletion.
# Species-level bar plot of the negative controls.
# `species20` keeps the species name for the 10 most abundant taxa and
# "[Others]" for the rest.
# NOTE(review): the top 10 are ranked over all control samples, not only the
# negative controls -- confirm this is intended.
tax_table(phyloseq_control_rel) %>%
  cbind(species20 = "[Others]") %>%
  {
    top10_species <- head(taxa_sums(phyloseq_control_rel) %>%
                            data.frame %>%
                            arrange(-.) %>%
                            row.names(), 10)
    .[top10_species, "species20"] <- as.character(.[top10_species, "Species"])
    # Column 9 is addressed by position (presumably `species20` -- TODO
    # confirm); strip the "s__" prefix and wrap in <i> for element_markdown().
    .[, 9] <- .[, 9] %>%
      gsub("s__", "", .) %>%
      gsub("_", " ", .) %>%
      paste("<i>", ., "</i>", sep = "")
    phyloseq_temp <- phyloseq_control_rel
    tax_table(phyloseq_temp) <- tax_table(.)
    phyloseq_temp
  } %>%
  subset_samples(., sample_type == "Neg.") %>%
  my_plot_bar(., fill = "species20") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Barplot of neg. data") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Top 10 species")) +
  facet_wrap(~ factor(treatment,
                      levels = c("Untreated", "lyPMA", "Benzonase",
                                 "Host zero", "Molysis", "QIAamp")),
             scales = "free_x", nrow = 1)
# Gram-stain composition of the negative controls, one facet per treatment.
phyloseq_control_rel %>%
  subset_samples(., sample_type == "Neg.") %>%
  my_plot_bar(., fill = "gram_stain") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram-stain of negative data") +
  theme(legend.text = element_markdown()) +
  # The fill is gram_stain, so the legend must not be titled "Top 10 species".
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(~ factor(treatment,
                      levels = c("Untreated", "lyPMA", "Benzonase",
                                 "Host zero", "Molysis", "QIAamp")),
             scales = "free_x", nrow = 1)
Samples - gram-stain?
Nasal swabs had few gram-positive bacteria, and this pattern persisted after depletion
BAL showed similar gram-negative/gram-positive ratios
Sputum showed a large decrease in gram-negative bacteria
Freeze/thaw cycles could be associated with this
Currently no further analysis is possible
# Gram-stain composition of the nasal samples, one facet per treatment.
# NOTE(review): the `species20` relabelling below is carried over from the
# species plots; the fill used here is `gram_stain`, so it is presumably not
# needed for this figure -- confirm before removing.
tax_table(phyloseq$phyloseq_rel) %>%
  cbind(species20 = "[Others]") %>%
  {
    top10_species <- head(taxa_sums(phyloseq$phyloseq_rel) %>%
                            data.frame %>%
                            arrange(-.) %>%
                            row.names(), 10)
    .[top10_species, "species20"] <- as.character(.[top10_species, "Species"])
    .[, 9] <- .[, 9] %>%
      gsub("s__", "", .) %>%
      gsub("_", " ", .) %>%
      paste("<i>", ., "</i>", sep = "")
    phyloseq_temp <- phyloseq$phyloseq_rel
    tax_table(phyloseq_temp) <- tax_table(.)
    phyloseq_temp
  } %>%
  subset_samples(., sample_type == "Nasal") %>%
  my_plot_bar(., fill = "gram_stain") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram stain of nasal samples") +
  theme(legend.text = element_markdown()) +
  # The fill is gram_stain, so the legend must not be titled "Top 10 species".
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(~ factor(treatment,
                      levels = c("Untreated", "lyPMA", "Benzonase",
                                 "Host zero", "Molysis", "QIAamp")),
             scales = "free_x", nrow = 1)
# Gram-stain composition of the BAL samples, one facet per treatment.
# NOTE(review): the `species20` relabelling below is carried over from the
# species plots; the fill used here is `gram_stain`, so it is presumably not
# needed for this figure -- confirm before removing.
tax_table(phyloseq$phyloseq_rel) %>%
  cbind(species20 = "[Others]") %>%
  {
    top10_species <- head(taxa_sums(phyloseq$phyloseq_rel) %>%
                            data.frame %>%
                            arrange(-.) %>%
                            row.names(), 10)
    .[top10_species, "species20"] <- as.character(.[top10_species, "Species"])
    .[, 9] <- .[, 9] %>%
      gsub("s__", "", .) %>%
      gsub("_", " ", .) %>%
      paste("<i>", ., "</i>", sep = "")
    phyloseq_temp <- phyloseq$phyloseq_rel
    tax_table(phyloseq_temp) <- tax_table(.)
    phyloseq_temp
  } %>%
  subset_samples(., sample_type == "BAL") %>%
  my_plot_bar(., fill = "gram_stain") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram stain of BAL samples") +
  theme(legend.text = element_markdown()) +
  # The fill is gram_stain, so the legend must not be titled "Top 10 species".
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(~ factor(treatment,
                      levels = c("Untreated", "lyPMA", "Benzonase",
                                 "Host zero", "Molysis", "QIAamp")),
             scales = "free_x", nrow = 1)
# Gram-stain composition of the sputum samples, one facet per treatment.
# NOTE(review): the `species20` relabelling below is carried over from the
# species plots; the fill used here is `gram_stain`, so it is presumably not
# needed for this figure -- confirm before removing.
tax_table(phyloseq$phyloseq_rel) %>%
  cbind(species20 = "[Others]") %>%
  {
    top10_species <- head(taxa_sums(phyloseq$phyloseq_rel) %>%
                            data.frame %>%
                            arrange(-.) %>%
                            row.names(), 10)
    .[top10_species, "species20"] <- as.character(.[top10_species, "Species"])
    .[, 9] <- .[, 9] %>%
      gsub("s__", "", .) %>%
      gsub("_", " ", .) %>%
      paste("<i>", ., "</i>", sep = "")
    phyloseq_temp <- phyloseq$phyloseq_rel
    tax_table(phyloseq_temp) <- tax_table(.)
    phyloseq_temp
  } %>%
  subset_samples(., sample_type == "Sputum") %>%
  my_plot_bar(., fill = "gram_stain") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram stain of sputum samples") +
  theme(legend.text = element_markdown()) +
  scale_fill_brewer(type = "qual", palette = 6) +
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(~ factor(treatment,
                      levels = c("Untreated", "lyPMA", "Benzonase",
                                 "Host zero", "Molysis", "QIAamp")),
             scales = "free_x", nrow = 1)
Results
2.3.1. Negative control showed minimal number of possible contaminants
2.3.2. Positive control contained various contaminants
QC4. Prevalence and abundance filtering - red flag
Taxa prevalence and abundance were checked.
Taxa abundance and prevalence
Histogram of taxa prevalence
No prevalence or abundance filtering (each experimental group is 5% of total sample)
Before analyzing, alpha diversity indices were calculated for all phyloseq objects
# Compute per-sample alpha-diversity indices for a phyloseq object and append
# them to its sample data.
#
# data: a phyloseq object. The OTU table is transposed before every vegan
#   call, so taxa are presumably stored as rows -- TODO confirm.
# Returns (invisibly, because the last expression is an assignment) a
#   data.frame: the sample data merged by row name (full outer join) with
#   S.obs, data_shannon, data_hill, data_invsimpson, data_evenness and the
#   columns returned by microbiome::dominance(); row names are restored from
#   the merge key.
alpha_diversity <- function(data) {
# NOTE(review): single-index `[` with a per-column logical; presumably meant to drop all-zero samples -- confirm that `[` on an otu_table subsets columns here and not rows.
otu_table <- otu_table(data) %>% .[colSums(.) !=0]
# Number of non-zero entries per row of the transposed table (observed richness per sample if taxa are rows).
S.obs <- rowSums(t(otu_table) != 0)
sample_data <- sample_data(data)
data_evenness <- vegan::diversity(t(otu_table)) / log(vegan::specnumber(t(otu_table))) # evenness: diversity (vegan default index) divided by log(number of species)
data_shannon <- vegan::diversity(t(otu_table), index = "shannon") # Shannon index (vegan)
data_hill <- exp(data_shannon) # exponential of the Shannon index
data_dominance <- microbiome::dominance(otu_table, index = "all", rank = 1, aggregate = TRUE) # all dominance indices from the microbiome package
data_invsimpson <- vegan::diversity(t(otu_table), index = "invsimpson") # inverse Simpson index (vegan)
alpha_diversity <- cbind(S.obs, data_shannon, data_hill, data_invsimpson, data_evenness,data_dominance) # combine all indices in one data table
# by = 0 merges on row names; all = T keeps samples present on either side.
sample_data <- merge(data.frame(sample_data), alpha_diversity, by = 0, all = T) %>% column_to_rownames(var = "Row.names")
}
# Keep an unfiltered copy of the phyloseq list and replace the sample data of
# each object with a version that carries the alpha-diversity columns.
phyloseq_unfiltered <- phyloseq
# The relative-abundance object receives the indices computed from the count object.
# NOTE(review): this line reads phyloseq_count before the next line overwrites its sample data; keep this order unless alpha_diversity() is confirmed safe to re-run on already-augmented sample data.
sample_data(phyloseq_unfiltered$phyloseq_rel) <- sample_data(alpha_diversity(phyloseq_unfiltered$phyloseq_count))
sample_data(phyloseq_unfiltered$phyloseq_count) <- sample_data(alpha_diversity(phyloseq_unfiltered$phyloseq_count))
sample_data(phyloseq_unfiltered$phyloseq_path_rpk) <- sample_data(alpha_diversity(phyloseq_unfiltered$phyloseq_path_rpk))
# Per-feature QC tables: prevalence and mean abundance across the non-empty
# (S.obs != 0) Mock, BAL, Nasal and Sputum samples, for taxa and for functions.
#
# In the initial analysis no prevalence or abundance filtering is performed
# (it may be considered for secondary differential-abundance analyses to
# manage the p (features) > n (sample size) problem and multiple-hypothesis
# correction).
#
# taxa_qc columns: species (feature names), prevalence (number of retained
# samples with a value > 0), mean_rel_abd (mean value per feature).
# NOTE(review): mean_rel_abd is computed from phyloseq_count, not from
# phyloseq_rel -- confirm that a mean of counts is what is wanted here.
taxa_qc <- data.frame("species" = otu_table(subset_samples(phyloseq_unfiltered$phyloseq_count,
S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum"))) %>%
t() %>% colnames(),
"prevalence" = ifelse(subset_samples(phyloseq_unfiltered$phyloseq_count, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>% otu_table() > 0, 1, 0) %>% t() %>% colSums(), # prevalence of taxa
"mean_rel_abd" = subset_samples(phyloseq_unfiltered$phyloseq_count, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>%
otu_table() %>%
t() %>%
colMeans(na.rm = T) # mean value per taxon
)
# Same table for the pathway object; data.frame() stores the "function" column under the name `function.` (see its use as function_qc$function.).
function_qc <- data.frame("function" = otu_table(subset_samples(phyloseq_unfiltered$phyloseq_path_rpk, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum"))) %>% t() %>% colnames(),
"prevalence" = ifelse(subset_samples(phyloseq_unfiltered$phyloseq_path_rpk, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>% otu_table() > 0, 1, 0) %>% t() %>% colSums(), # prevalence of functions
"mean_rpk" = subset_samples(phyloseq_unfiltered$phyloseq_path_rpk, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>% otu_table() %>% t() %>% colMeans(na.rm = T) # mean RPK per function
)
# Distribution of taxon prevalence on a log10 scale.
hist(log10(taxa_qc$prevalence), xlab = "log10(Taxa prevalence)", main = "Histogram of prevalence of taxa")
Histogram of mean abundance
# Distribution of taxa_qc$mean_rel_abd on a log10 scale.
hist(log10(taxa_qc$mean_rel_abd), xlab = "log10(Mean relative abundance)", main = "Histogram of mean relative abundance")
decontam - stratified by sample type
Decontam analysis result stratified by sample type
Decontam analysis result stratified by sample type
# Decontam analysis on all samples and stratified by sample type.
# The result is `contaminant`, a long table with one row per flagged taxon:
# columns method ("prevalence" / "frequency" / "combined"), sample_type
# (stratum label) and contaminant (taxon name).

# Negative controls are identified by "Neg" in sample_type; `is.neg` is the
# column handed to isContaminant() through `neg`.
sample_data(phyloseq_unfiltered$phyloseq_rel)$is.neg <-
  grepl("Neg", sample_data(phyloseq_unfiltered$phyloseq_rel)$sample_type)

# Convert one isContaminant() result into long-format rows.
#
# decontam_result: data.frame returned by isContaminant(); row names are the
#   taxa and `contaminant` is the logical call.
# method, sample_type: labels stored alongside every hit.
# Returns a data.frame(method, sample_type, contaminant). When nothing is
# flagged it has zero rows, instead of the error that
# data.frame("x", "y", character(0)) raises, so empty strata can be
# rbind()-ed like any other.
contaminant_rows <- function(decontam_result, method, sample_type) {
  # which() drops NA calls, matching what subset() did before.
  hits <- row.names(decontam_result)[which(decontam_result$contaminant)]
  data.frame(
    method = rep(method, length(hits)),
    sample_type = rep(sample_type, length(hits)),
    contaminant = hits
  )
}

# --- Prevalence method -------------------------------------------------------
# NOTE(review): only this "all" run excludes Mock samples; the frequency and
# combined "all" runs below do not -- confirm this is intended.
contaminant1 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0 & sample_type != "Mock") %>%
  isContaminant(method = "prevalence", neg = "is.neg", threshold = 0.1) %>%
  contaminant_rows("prevalence", "all")

contaminant2 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  subset_samples(sample_type %in% c("BAL", "Neg.")) %>%
  isContaminant(method = "prevalence", neg = "is.neg", threshold = 0.1) %>%
  contaminant_rows("prevalence", "BAL")

contaminant3 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  subset_samples(sample_type %in% c("Nasal", "Neg.")) %>%
  isContaminant(method = "prevalence", neg = "is.neg", threshold = 0.1) %>%
  contaminant_rows("prevalence", "Nasal swab")

contaminant4 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  subset_samples(sample_type %in% c("Sputum", "Neg.")) %>%
  isContaminant(method = "prevalence", neg = "is.neg", threshold = 0.1) %>%
  contaminant_rows("prevalence", "Sputum")

# --- Frequency method (uses the DNA_bac_well concentration column) -----------
contaminant5 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  isContaminant(method = "frequency", conc = "DNA_bac_well") %>%
  contaminant_rows("frequency", "all")

# Frequency runs for BAL (contaminant6) and Nasal swab (contaminant7) stay
# disabled, as in the original analysis, where they produced no contaminants:
#contaminant6 <- phyloseq_unfiltered$phyloseq_rel %>%
#  subset_samples(S.obs != 0) %>%
#  subset_samples(sample_type %in% c("BAL")) %>%
#  isContaminant(method = "frequency", conc = "DNA_bac_well") %>%
#  contaminant_rows("frequency", "BAL")
#contaminant7 <- phyloseq_unfiltered$phyloseq_rel %>%
#  subset_samples(S.obs != 0) %>%
#  subset_samples(sample_type %in% c("Nasal")) %>%
#  isContaminant(method = "frequency", conc = "DNA_bac_well") %>%
#  contaminant_rows("frequency", "Nasal swab")

contaminant8 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  subset_samples(sample_type %in% c("Sputum")) %>%
  isContaminant(method = "frequency", conc = "DNA_bac_well") %>%
  contaminant_rows("frequency", "Sputum")

# --- Combined method ---------------------------------------------------------
contaminant9 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  isContaminant(method = "combined", neg = "is.neg", conc = "DNA_bac_well") %>%
  contaminant_rows("combined", "all")

contaminant10 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  subset_samples(sample_type %in% c("BAL", "Neg.")) %>%
  isContaminant(method = "combined", neg = "is.neg", conc = "DNA_bac_well") %>%
  contaminant_rows("combined", "BAL")

# Combined run for Nasal swab (contaminant11) stays disabled, as in the
# original analysis, where it produced no contaminants:
#contaminant11 <- phyloseq_unfiltered$phyloseq_rel %>%
#  subset_samples(S.obs != 0) %>%
#  subset_samples(sample_type %in% c("Nasal", "Neg.")) %>%
#  isContaminant(method = "combined", neg = "is.neg", conc = "DNA_bac_well") %>%
#  contaminant_rows("combined", "Nasal swab")

contaminant12 <- phyloseq_unfiltered$phyloseq_rel %>%
  subset_samples(S.obs != 0) %>%
  subset_samples(sample_type %in% c("Sputum", "Neg.")) %>%
  isContaminant(method = "combined", neg = "is.neg", conc = "DNA_bac_well") %>%
  contaminant_rows("combined", "Sputum")

# Stack every run that was executed (6, 7 and 11 are disabled above).
contaminant <- rbind(contaminant1, contaminant2, contaminant3, contaminant4,
                     contaminant5, contaminant8, contaminant9, contaminant10,
                     contaminant12)
names(contaminant) <- c("method", "sample_type", "contaminant")
# Print the contaminant calls; lines starting with "##" are the rendered output.
# Unique taxa flagged by the prevalence method in any stratified run (the "all" run is excluded).
cat("List of contaminant (prevalence method) - from stratified analysis\n\n")
## List of contaminant (prevalence method) - from stratified analysis
contaminant %>% subset(., .$method == "prevalence" & .$sample_type != "all") %>% .$contaminant %>% unique()
## [1] "Cutibacterium_acnes" "Thermoleophilum_album"
## [3] "Paludisphaera_borealis" "Escherichia_coli"
## [5] "Bacillus_intestinalis" "Listeria_monocytogenes"
## [7] "Microbacterium_ginsengisoli" "Microbacterium_laevaniformans"
## [9] "Arthrobacter_agilis" "Brevundimonas_diminuta"
## [11] "Bosea_vaviloviae" "Acetobacter_aceti"
## [13] "Alcaligenes_faecalis" "Pseudomonas_formosensis"
## [15] "Enterococcus_faecalis" "Lactobacillus_fermentum"
# Number of flagged taxa per stratum, prevalence method.
cat("Prevalence decontam table\n\n")
## Prevalence decontam table
contaminant %>% subset(., .$method == "prevalence") %>% .$sample_type %>% table
## .
## all BAL Nasal swab Sputum
## 16 14 12 12
# Number of flagged taxa per stratum, frequency method.
cat("Frequency decontam table\n\n")
## Frequency decontam table
contaminant %>% subset(., .$method == "frequency") %>% .$sample_type %>% table
## .
## all Sputum
## 3 3
# Number of flagged taxa per stratum, combined method.
cat("Combined decontam table\n\n")
## Combined decontam table
contaminant %>% subset(., .$method == "combined") %>% .$sample_type %>% table
## .
## all BAL Sputum
## 6 1 2
# Summary table: number of taxa flagged by each decontam method (rows) in the
# non-stratified run and in each stratified run (columns).
# The counts are derived from `contaminant` instead of being typed in, so the
# table cannot drift from the analysis above; strata with no hits show 0.
decontam_table_methods <- c("Prevalence method" = "prevalence",
                            "Frequency method" = "frequency",
                            "Combined" = "combined")
# Names are the displayed column headers (" " is the non-stratified column),
# values are the labels stored in contaminant$sample_type.
decontam_table_strata <- c(" " = "all",
                           "BAL" = "BAL",
                           "Nasal swab" = "Nasal swab",
                           "Sputum" = "Sputum")
vapply(
  decontam_table_strata,
  function(stratum) {
    vapply(
      decontam_table_methods,
      function(method_name) {
        sum(contaminant$method == method_name &
              contaminant$sample_type == stratum)
      },
      numeric(1)
    )
  },
  numeric(length(decontam_table_methods))
) %>%
  # check.names = FALSE keeps the " " and "Nasal swab" headers intact.
  data.frame(check.names = FALSE) %>%
  kbl(format = "html") %>%
  add_header_above(c(" " = 1, "Non-stratified" = 1, "Stratified" = 3)) %>%
  kable_styling(full_width = 0, html_font = "serif")
| BAL | Nasal swab | Sputum | ||
|---|---|---|---|---|
| Prevalence method | 16 | 14 | 12 | 12 |
| Frequency method | 3 | 0 | 0 | 3 |
| Combined | 6 | 1 | 0 | 2 |
Red flags
Taxa with low prevalences were red-flagged
Prevalence/abundance filtering was employed
# Per-feature QC tables used for red-flagging: prevalence and mean abundance
# across the non-empty (S.obs != 0) Mock, BAL, Nasal and Sputum samples.
# taxa_qc columns: species (feature names), prevalence (number of retained
# samples with a value > 0), mean_rel_abd (mean value per feature).
# NOTE(review): mean_rel_abd is computed from phyloseq_count, not from
# phyloseq_rel -- confirm that a mean of counts is what is wanted here.
taxa_qc <- data.frame("species" = otu_table(subset_samples(phyloseq_unfiltered$phyloseq_count,
S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum"))) %>%
t() %>% colnames(),
"prevalence" = ifelse(subset_samples(phyloseq_unfiltered$phyloseq_count, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>% otu_table() > 0, 1, 0) %>% t() %>% colSums(), # prevalence of taxa
"mean_rel_abd" = subset_samples(phyloseq_unfiltered$phyloseq_count, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>%
otu_table() %>%
t() %>%
colMeans(na.rm = T) # mean value per taxon
)
# Same table for the pathway object; data.frame() stores the "function" column under the name `function.` (see its use as function_qc$function.).
function_qc <- data.frame("function" = otu_table(subset_samples(phyloseq_unfiltered$phyloseq_path_rpk, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum"))) %>% t() %>% colnames(),
"prevalence" = ifelse(subset_samples(phyloseq_unfiltered$phyloseq_path_rpk, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>% otu_table() > 0, 1, 0) %>% t() %>% colSums(), # prevalence of functions
"mean_rpk" = subset_samples(phyloseq_unfiltered$phyloseq_path_rpk, S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")) %>% otu_table() %>% t() %>% colMeans(na.rm = T) # mean RPK per function
)
# Red flags and filtering.
#
# red_flag_prev_abd is 1 when a feature is both rare (prevalence below 5% of
# the sample count) and low-abundance (mean below the 75th percentile of all
# feature means), else 0.
# NOTE(review): the 5% threshold is taken from the number of samples in the
# full object, while prevalence was counted only on the non-empty
# Mock/BAL/Nasal/Sputum samples -- confirm the denominator.
red_flag_taxa <- data.frame(
  species = taxa_qc$species,
  red_flag_prev_abd = ifelse(
    taxa_qc$prevalence <
      length(rownames(t(otu_table(phyloseq_unfiltered$phyloseq_rel)))) * 0.05 &
      taxa_qc$mean_rel_abd < quantile(taxa_qc$mean_rel_abd, 0.75),
    1, 0
  )
) %>%
  # Flag taxa called contaminants by the prevalence method in any stratified run.
  mutate(red_flag_decontam_prev = species %in% (contaminant %>%
    subset(., .$method == "prevalence" & .$sample_type != "all") %>%
    .$contaminant %>%
    unique()))

red_flag_function <- data.frame(
  function. = function_qc$function.,
  red_flag_prev_abd = ifelse(
    function_qc$prevalence <
      length(rownames(t(otu_table(phyloseq$phyloseq_path_rpk)))) * 0.05 &
      function_qc$mean_rpk < quantile(function_qc$mean_rpk, 0.75),
    1, 0
  )
)

# phyloseq of table 1: only decontam-flagged taxa are removed.
phyloseq$phyloseq_count_prev <- prune_taxa(
  subset(red_flag_taxa, red_flag_taxa$red_flag_decontam_prev != 1)$species,
  phyloseq$phyloseq_count
)

# phyloseq for analysis: remove rare/low-abundance taxa and decontam-flagged taxa.
phyloseq$phyloseq_rel <- prune_taxa(
  subset(red_flag_taxa,
         red_flag_taxa$red_flag_prev_abd != 1 &
           !red_flag_taxa$red_flag_decontam_prev)$species,
  phyloseq$phyloseq_rel
)
phyloseq$phyloseq_count <- prune_taxa(
  subset(red_flag_taxa,
         red_flag_taxa$red_flag_prev_abd != 1 &
           !red_flag_taxa$red_flag_decontam_prev)$species,
  phyloseq$phyloseq_count
)
# Keep the functions that are not red-flagged. This must subset
# red_flag_function: the earlier version subset red_flag_taxa and read a
# `function.` column that does not exist there, so prune_taxa() received NULL
# and the pathway table was left unfiltered.
phyloseq$phyloseq_path_rpk <- prune_taxa(
  subset(red_flag_function,
         red_flag_function$red_flag_prev_abd != 1)$function.,
  phyloseq$phyloseq_path_rpk
)
#phyloseq$tree_phyloseq_count <- prune_taxa(subset(red_flag_taxa,
#red_flag_taxa$red_flag_prev_abd != 1 & !red_flag_taxa$red_flag_decontam_prev)$species,
#phyloseq$tree_phyloseq_count)
#phyloseq$tree_phyloseq_rel <- prune_taxa(subset(red_flag_taxa,
#red_flag_taxa$red_flag_prev_abd != 1 & !red_flag_taxa$red_flag_decontam_prev)$species,
#phyloseq$tree_phyloseq_rel)
Before analyzing, alpha diversity indices were calculated for all filtered phyloseq objects
# Calculation of alpha diversity indices for filtered samples.
#
# Compute per-sample alpha-diversity indices for a phyloseq object and append
# them to its sample data.
#
# data: a phyloseq object. The OTU table is transposed before every vegan
#   call, so taxa are presumably stored as rows -- TODO confirm.
# Returns (invisibly, because the last expression is an assignment) a
#   data.frame: the sample data merged by row name (full outer join) with
#   S.obs, data_shannon, data_hill, data_invsimpson, data_evenness and the
#   columns returned by microbiome::dominance(); row names are restored from
#   the merge key.
alpha_diversity <- function(data) {
# NOTE(review): single-index `[` with a per-column logical; presumably meant to drop all-zero samples -- confirm that `[` on an otu_table subsets columns here and not rows.
otu_table <- otu_table(data) %>% .[colSums(.) !=0]
# Number of non-zero entries per row of the transposed table (observed richness per sample if taxa are rows).
S.obs <- rowSums(t(otu_table) != 0)
sample_data <- sample_data(data)
data_evenness <- vegan::diversity(t(otu_table)) / log(vegan::specnumber(t(otu_table))) # evenness: diversity (vegan default index) divided by log(number of species)
data_shannon <- vegan::diversity(t(otu_table), index = "shannon") # Shannon index (vegan)
data_hill <- exp(data_shannon) # exponential of the Shannon index
data_dominance <- microbiome::dominance(otu_table, index = "all", rank = 1, aggregate = TRUE) # all dominance indices from the microbiome package
data_invsimpson <- vegan::diversity(t(otu_table), index = "invsimpson") # inverse Simpson index (vegan)
alpha_diversity <- cbind(S.obs, data_shannon, data_hill, data_invsimpson, data_evenness,data_dominance) # combine all indices in one data table
# by = 0 merges on row names; all = T keeps samples present on either side.
sample_data <- merge(data.frame(sample_data), alpha_diversity, by = 0, all = T) %>% column_to_rownames(var = "Row.names")
}
# Replace the sample data of each filtered object with a version that carries
# the alpha-diversity columns.
sample_data(phyloseq$phyloseq_count_prev) <- sample_data(alpha_diversity(phyloseq$phyloseq_count_prev))
# The relative-abundance object receives the indices computed from the count object.
# NOTE(review): this line reads phyloseq_count before the next line overwrites its sample data; keep this order unless alpha_diversity() is confirmed safe to re-run on already-augmented sample data.
sample_data(phyloseq$phyloseq_rel) <- sample_data(alpha_diversity(phyloseq$phyloseq_count))
sample_data(phyloseq$phyloseq_count) <- sample_data(alpha_diversity(phyloseq$phyloseq_count))
sample_data(phyloseq$phyloseq_path_rpk) <- sample_data(alpha_diversity(phyloseq$phyloseq_path_rpk))
#sample_data(phyloseq$tree_phyloseq_count) <- sample_data(alpha_diversity(phyloseq$tree_phyloseq_count))
#phyloseq$tree_phyloseq_rel <- transform_sample_counts(phyloseq$tree_phyloseq_count, function (x) {x/sum(x)})
# Sample data of the filtered count object, stored in a variable named `sample_data` and used as the data argument of the ggplot() calls below.
sample_data <- sample_data(phyloseq$phyloseq_count)
QC 3 results:
3.1. In the initial analysis we will not perform prevalence or abundance filtering (though we may consider this for secondary differential abundance analyses to manage the p (features) > n (sample size) problem and issues with multiple hypothesis correction)
3.2. Red flags were made for taxa not satisfying the criteria (prev < 0.05 & mean rel < 0.75Q)
3.3. Although we don’t consider prevalence or abundance at this time, we can consider their red flags after running the DA analysis
Analysis
A1. Host DNA, bacterial DNA by sample type and treatment
qPCR and sequencing results
qPCR result
# Figure 2A: log10 of total qPCR DNA (host + bacterial) per sample type,
# one box per treatment.
f2a <- ggplot(
  sample_data,
  aes(x = sample_type, y = log10(DNA_host_nondil + DNA_bac_nondil))
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  # Palette from https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  scale_fill_manual(
    values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3")
  ) +
  labs(
    x = "Sample type",
    y = "log<sub>10</sub>(qPCR total DNA)<br>(ng/μL)",
    tag = "A"
  ) +
  theme_classic(base_size = 12, base_family = "serif") +
  #scale_x_discrete(label = c( "Mock", "Neg.", "BAL", "Nasal", "Sputum")) +
  # Panel tag size; element_markdown() renders the HTML in the y-axis title.
  theme(
    plot.tag = element_text(size = 15),
    axis.title.y = element_markdown()
  ) +
  guides(fill = guide_legend(nrow = 1, title = "Treatment"))
# Figure 2B: log10 of host (human) qPCR DNA per sample type, one box per
# treatment.
f2b <- ggplot(
  sample_data,
  aes(x = sample_type, y = log10(DNA_host_nondil))
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  scale_fill_manual(
    values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3")
  ) +
  labs(
    x = "Sample type",
    y = "log<sub>10</sub>(qPCR host DNA)<br>(ng/μL)",
    tag = "B"
  ) +
  theme_classic(base_size = 12, base_family = "serif") +
  #scale_x_discrete(label = c("BAL", "Nasal", "Sputum", "Mock", "Neg.")) +
  # Panel tag size; element_markdown() renders the HTML in the y-axis title.
  theme(
    plot.tag = element_text(size = 15),
    axis.title.y = element_markdown()
  ) +
  guides(fill = guide_legend(nrow = 1, title = "Treatment"))
# Figure 2C: log10 of bacterial (16S) qPCR DNA per sample type, one box per
# treatment.
f2c <- ggplot(
  sample_data,
  aes(x = sample_type, y = log10(DNA_bac_nondil))
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  scale_fill_manual(
    values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3")
  ) +
  labs(
    x = "Sample type",
    y = "log<sub>10</sub>(qPCR bacterial DNA)<br>(ng/μL)",
    tag = "C"
  ) +
  theme_classic(base_size = 12, base_family = "serif") +
  #scale_x_discrete(label = c("BAL", "Nasal", "Sputum", "Mock", "Neg.")) +
  # Panel tag size; element_markdown() renders the HTML in the y-axis title.
  theme(
    plot.tag = element_text(size = 15),
    axis.title.y = element_markdown()
  ) +
  guides(fill = guide_legend(nrow = 1, title = "Treatment"))
# Figure 2D: host DNA proportion (qPCR) per sample type, one box per
# treatment.
f2d <- ggplot(
  sample_data,
  aes(x = sample_type, y = host_proportion)
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  scale_fill_manual(
    values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3")
  ) +
  labs(
    x = "Sample type",
    y = "Host DNA ratio",
    tag = "D"
  ) +
  theme_classic(base_size = 12, base_family = "serif") +
  #scale_x_discrete(label = c("BAL", "Nasal", "Sputum", "Mock", "Neg.")) +
  # Panel tag size; y-axis title kept as a markdown element like panels A-C.
  theme(
    plot.tag = element_text(size = 15),
    axis.title.y = element_markdown()
  ) +
  guides(fill = guide_legend(nrow = 1, title = "Treatment"))
# Output for markdown: arrange the four qPCR panels (f2a-f2d) into one figure
# that shares a single legend. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
ggarrange(f2a, f2b, f2c, f2d, common.legend = TRUE, align = "hv")
Figure 2. qPCR result of host depletion study. A. Total DNA B. Host DNA C. Bacterial DNA D. Host %
Sequencing result
# 3A: log10 raw sequencing reads per sample type, filled by treatment.
f3a <- ggplot(sample_data, aes(x = sample_type, y = log10(Raw_reads))) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  theme_classic(base_size = 12, base_family = "serif") +
  # Colors chosen with https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  # Labels are matched to the treatment levels by position; the last label
  # previously read "QIAaamp" (typo) and now reads "QIAamp".
  scale_fill_manual(
    values = c(
      "#e31a1c", "#fb9a99", "#33a02c",
      "#b2df8a", "#1f78b4", "#a6cee3"
    ),
    name = "Treatment",
    labels = c(
      "Control", "lyPMA", "Benzonase",
      "Host zero", "Molysis", "QIAamp"
    )
  ) +
  scale_x_discrete(name = "Sample type") +
  # The y title carries <sub> markup, hence element_markdown().
  theme(
    axis.title.y = element_markdown(),
    plot.tag = element_text(size = 15)
  ) +
  ylab("log<sub>10</sub>(raw reads)") +
  labs(tag = "A") +
  guides(fill = guide_legend(nrow = 1))
# 3B: log10 host-mapped reads (Host_mapped) per sample type, filled by
# treatment.
f3b <- ggplot(sample_data, aes(x = sample_type, y = log10(Host_mapped))) +
  theme_classic(base_size = 12, base_family = "serif") +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  # Colors chosen with https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  # Labels are matched to the treatment levels by position; the last label
  # previously read "QIAaamp" (typo) and now reads "QIAamp".
  scale_fill_manual(
    values = c(
      "#e31a1c", "#fb9a99", "#33a02c",
      "#b2df8a", "#1f78b4", "#a6cee3"
    ),
    name = "Treatment",
    labels = c(
      "Control", "lyPMA", "Benzonase",
      "Host zero", "Molysis", "QIAamp"
    )
  ) +
  scale_x_discrete(name = "Sample type") +
  # The y title carries <sub> markup, hence element_markdown().
  theme(
    axis.title.y = element_markdown(),
    plot.tag = element_text(size = 15)
  ) +
  ylab("log<sub>10</sub>(host reads)") +
  labs(tag = "B") +
  guides(fill = guide_legend(nrow = 1))
# - % Host (we have used Host_mapped/Raw_reads in prior papers)
# 3C: log10 final reads (Final_reads) per sample type, filled by treatment.
f3c <- ggplot(sample_data, aes(x = sample_type, y = log10(Final_reads))) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  # Colors chosen with https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  # Labels are matched to the treatment levels by position; the last label
  # previously read "QIAaamp" (typo) and now reads "QIAamp".
  scale_fill_manual(
    values = c(
      "#e31a1c", "#fb9a99", "#33a02c",
      "#b2df8a", "#1f78b4", "#a6cee3"
    ),
    name = "Treatment",
    labels = c(
      "Control", "lyPMA", "Benzonase",
      "Host zero", "Molysis", "QIAamp"
    )
  ) +
  scale_x_discrete(name = "Sample type") +
  ylab("log<sub>10</sub>(final reads)") +
  theme_classic(base_size = 12, base_family = "serif") +
  # The tag is plain text ("C"), so it uses element_text() like the other
  # panels of this figure; only the y title needs markdown rendering.
  theme(
    axis.title.y = element_markdown(),
    plot.tag = element_text(size = 15)
  ) +
  labs(tag = "C") +
  guides(fill = guide_legend(nrow = 1))
# - % Host (we have used Host_mapped/Raw_reads in prior papers)
# 3D: host proportion by sequencing (sequencing_host_prop) per sample type,
# filled by treatment.
f3d <- ggplot(sample_data, aes(x = sample_type, y = sequencing_host_prop)) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  theme_classic(base_size = 12, base_family = "serif") +
  # Colors chosen with https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  # Labels are matched to the treatment levels by position; the last label
  # previously read "QIAaamp" (typo) and now reads "QIAamp".
  scale_fill_manual(
    values = c(
      "#e31a1c", "#fb9a99", "#33a02c",
      "#b2df8a", "#1f78b4", "#a6cee3"
    ),
    name = "Treatment",
    labels = c(
      "Control", "lyPMA", "Benzonase",
      "Host zero", "Molysis", "QIAamp"
    )
  ) +
  scale_x_discrete(name = "Sample type") +
  theme(
    axis.title.y = element_markdown(),
    plot.tag = element_text(size = 15)
  ) +
  ylab("Host ratio by sequencing") +
  labs(tag = "D") +
  guides(fill = guide_legend(nrow = 1))
# Arrange the four sequencing panels (f3a-f3d) into one figure with a shared
# legend. `TRUE` is spelled out because `T` can be reassigned.
ggarrange(f3a, f3b, f3c, f3d, common.legend = TRUE, align = "hv")
Figure 3. Sequencing results of the host depletion study. A. Raw reads B. Host-mapped reads C. Final reads D. Host read ratio
Results A1
1.1. Some changes were observed for both host DNA and bacterial DNA.
1.2. Sequencing results need to be added
This will be Fig 2. of the manuscript, after removing positives and negatives
A2. Modeling on sequencing results
As some changes were observed after treatment, linear mixed-effects models were used for testing.
Test results
Library failure - ANOVA
Some samples failed in library prep. Which sample types were fragile to the treatments?
glm ( library fail ~ sample_type + treatment + sample_type * treatment + subject_id )
# ANOVA-style table (Anova(); Chisq / Df / Pr(>Chisq)) for library-prep
# failure with a random intercept per subject.
#
# The model was previously fitted with glmer() and no `family`, which lme4
# treats as a deprecated shortcut to lmer(). lme4::lmer() is called directly
# so the same linear mixed model is fitted without the deprecated path.
# NOTE(review): lib_failed appears to be a binary outcome; a binomial GLMM
# (glmer(..., family = binomial)) would be the conventional model but would
# change the reported estimates - confirm before switching.
lme4::lmer(
  lib_failed ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  Anova() %>%
  data.frame(check.names = FALSE) %>%
  # Flag terms with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>Chisq)` < 0.05 ~ "*", .default = " ")) %>%
  # Show interaction terms as "a * b" instead of "a:b" in the row names.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| | Chisq | Df | Pr(>Chisq) | |
|---|---|---|---|---|
| sample_type | 20.19828 | 4 | 0.0004563 | * |
| treatment | 41.67568 | 5 | 0.0000001 | * |
| sample_type * treatment | 92.13915 | 20 | 0.0000000 | * |
Library failure
glm ( library fail ~ sample_type + treatment + sample_type * treatment + subject_id )
Nasals were fragile to lyPMA and Molysis
# Fixed-effect coefficient table for library-prep failure with a random
# intercept per subject.
#
# The model was previously fitted with glmer() and no `family`, which lme4
# treats as a deprecated shortcut to lmer(). lme4::lmer() is called directly
# (namespace-qualified so the summary keeps its Estimate / Std. Error /
# t value columns that the `t value` filter below relies on).
# NOTE(review): lib_failed appears to be a binary outcome; a binomial GLMM
# (glmer(..., family = binomial)) would be the conventional model but would
# change the reported estimates - confirm before switching.
lme4::lmer(
  lib_failed ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  .[["coefficients"]] %>%
  data.frame(check.names = FALSE) %>%
  # No p-values in this summary: flag coefficients with |t| > 2 instead.
  mutate(` ` = case_when(abs(`t value`) > 2 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "a * b".
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | t value | ||
|---|---|---|---|---|
| (Intercept) | 0.0000000 | 0.1317706 | 0.0000000 | |
| Mock | 0.0000000 | 0.1863518 | 0.0000000 | |
| BAL | 0.0000000 | 0.1630767 | 0.0000000 | |
| Nasal | 0.0000000 | 0.1482523 | 0.0000000 | |
| Sputum | 0.0000000 | 0.1630767 | 0.0000000 | |
| lyPMA | 0.0000000 | 0.1125488 | 0.0000000 | |
| Benzonase | 0.0000000 | 0.1125488 | 0.0000000 | |
| Host zero | 0.0000000 | 0.1125488 | 0.0000000 | |
| Molysis | 0.0000000 | 0.1125488 | 0.0000000 | |
| QIAamp | 0.0000000 | 0.1125488 | 0.0000000 | |
| Mock * lyPMA | 0.0000000 | 0.1591681 | 0.0000000 | |
| BAL * lyPMA | 0.2000000 | 0.1627453 | 1.2289140 | |
| Nasal * lyPMA | 0.8187152 | 0.1541029 | 5.3127843 |
|
| Sputum * lyPMA | 0.0000000 | 0.1627453 | 0.0000000 | |
| Mock * Benzonase | 0.0000000 | 0.1591681 | 0.0000000 | |
| BAL * Benzonase | 0.0000000 | 0.1627453 | 0.0000000 | |
| Nasal * Benzonase | -0.0424433 | 0.1542275 | -0.2751992 | |
| Sputum * Benzonase | 0.0000000 | 0.1627453 | 0.0000000 | |
| Mock * Host zero | 0.0000000 | 0.1591681 | 0.0000000 | |
| BAL * Host zero | 0.2000000 | 0.1627453 | 1.2289140 | |
| Nasal * Host zero | 0.3575567 | 0.1542275 | 2.3183715 |
|
| Sputum * Host zero | 0.0000000 | 0.1627453 | 0.0000000 | |
| Mock * Molysis | 0.0000000 | 0.1591681 | 0.0000000 | |
| BAL * Molysis | 0.2000000 | 0.1627453 | 1.2289140 | |
| Nasal * Molysis | 0.7812848 | 0.1541029 | 5.0698914 |
|
| Sputum * Molysis | 0.0000000 | 0.1627453 | 0.0000000 | |
| Mock * QIAamp | 0.0000000 | 0.1591681 | 0.0000000 | |
| BAL * QIAamp | 0.0000000 | 0.1627453 | 0.0000000 | |
| Nasal * QIAamp | 0.0424433 | 0.1542275 | 0.2751992 | |
| Sputum * QIAamp | 0.0000000 | 0.1627453 | 0.0000000 |
Sequencing failure
Modeling of sequencing failure was not feasible due to the low number of cases.
BAL079 - control & lyPMA failed sequencing.
# Fixed-effect coefficient table for sequencing failure, defined here as a
# sample with zero observed species (S.obs == 0), with a random intercept per
# subject.
#
# The model was previously fitted with glmer() and no `family`, which lme4
# treats as a deprecated shortcut to lmer(). lme4::lmer() is called directly
# (namespace-qualified so the summary keeps its Estimate / Std. Error /
# t value columns that the `t value` filter below relies on).
# NOTE(review): sequencing_fail is a logical outcome; a binomial GLMM
# (glmer(..., family = binomial)) would be the conventional model but would
# change the reported estimates - confirm before switching.
sample_data(phyloseq$phyloseq_count) %>%
  data.frame() %>%
  mutate(sequencing_fail = (S.obs == 0)) %>%
  lme4::lmer(
    sequencing_fail ~ sample_type + treatment + sample_type * treatment +
      (1 | subject_id),
    data = .
  ) %>%
  summary() %>%
  .[["coefficients"]] %>%
  data.frame(check.names = FALSE) %>%
  # No p-values in this summary: flag coefficients with |t| > 2 instead.
  mutate(` ` = case_when(abs(`t value`) > 2 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "a * b".
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | t value | ||
|---|---|---|---|---|
| (Intercept) | 0.1666667 | 0.0831711 | 2.0039018 |
|
| Mock | -0.1666667 | 0.1176217 | -1.4169726 | |
| BAL | 0.0333333 | 0.1231391 | 0.2706967 | |
| Nasal | -0.1666667 | 0.1050730 | -1.5861993 | |
| Sputum | -0.1666667 | 0.1231391 | -1.3534833 | |
| lyPMA | 0.2333333 | 0.1228702 | 1.8990225 | |
| Benzonase | 0.0333333 | 0.1228702 | 0.2712889 | |
| Host zero | 0.0333333 | 0.1228702 | 0.2712889 | |
| Molysis | -0.1666667 | 0.1228702 | -1.3564447 | |
| QIAamp | -0.1666667 | 0.1228702 | -1.3564447 | |
| Mock * lyPMA | -0.2333333 | 0.1737647 | -1.3428117 | |
| BAL * lyPMA | -0.2333333 | 0.1776701 | -1.3132957 | |
| Nasal * lyPMA | -0.2333333 | 0.1656948 | -1.4082118 | |
| Sputum * lyPMA | -0.2333333 | 0.1776701 | -1.3132957 | |
| Mock * Benzonase | -0.0333333 | 0.1737647 | -0.1918302 | |
| BAL * Benzonase | -0.2333333 | 0.1776701 | -1.3132957 | |
| Nasal * Benzonase | -0.0333333 | 0.1656948 | -0.2011731 | |
| Sputum * Benzonase | -0.0333333 | 0.1776701 | -0.1876137 | |
| Mock * Host zero | -0.0333333 | 0.1737647 | -0.1918302 | |
| BAL * Host zero | -0.2333333 | 0.1776701 | -1.3132957 | |
| Nasal * Host zero | -0.0333333 | 0.1656948 | -0.2011731 | |
| Sputum * Host zero | -0.0333333 | 0.1776701 | -0.1876137 | |
| Mock * Molysis | 0.1666667 | 0.1737647 | 0.9591512 | |
| BAL * Molysis | -0.0333333 | 0.1776701 | -0.1876137 | |
| Nasal * Molysis | 0.1666667 | 0.1656948 | 1.0058656 | |
| Sputum * Molysis | 0.1666667 | 0.1776701 | 0.9380684 | |
| Mock * QIAamp | 0.1666667 | 0.1737647 | 0.9591512 | |
| BAL * QIAamp | -0.0333333 | 0.1776701 | -0.1876137 | |
| Nasal * QIAamp | 0.1666667 | 0.1656948 | 1.0058655 | |
| Sputum * QIAamp | 0.1666667 | 0.1776701 | 0.9380684 |
log10(Final reads) - ANOVA
Which methods were effective in increasing the final reads?
Interaction term was significant
# ANOVA table for the mixed model of log10(final reads): sample type,
# treatment and their interaction, with a random intercept per subject.
lmer(
  log10(Final_reads) ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  anova() %>%
  data.frame(check.names = FALSE) %>%
  # Flag terms with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>F)` < 0.05 ~ "*", .default = " ")) %>%
  # Show interaction terms as "a * b" instead of "a:b" in the row names.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| | Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | |
|---|---|---|---|---|---|---|---|
| sample_type | 7.219703 | 1.8049257 | 4 | 10.20498 | 11.573353 | 0.0008358 | * |
| treatment | 16.897219 | 3.3794439 | 5 | 113.82452 | 21.669311 | 0.0000000 | * |
| sample_type * treatment | 14.273189 | 0.7136594 | 20 | 111.51262 | 4.576051 | 0.0000001 | * |
log10(Final reads)
Which methods were effective in increasing the final reads?
lmer( log10(Final reads) vs sample_type + treatment + sample_type * treatment + subject_id )
Except lyPMA, every method increased final reads
# Fixed-effect coefficient table for the mixed model of log10(final reads),
# with a random intercept per subject.
lmer(
  log10(Final_reads) ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  .[["coefficients"]] %>%
  data.frame(check.names = FALSE) %>%
  # Flag coefficients with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>|t|)` < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "a * b".
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 5.8969160 | 0.2896787 | 13.69170 | 20.3567474 | 0.0000000 |
|
| Mock | 2.0643827 | 0.4096675 | 13.69170 | 5.0391660 | 0.0001936 |
|
| BAL | -0.4036097 | 0.3559340 | 21.62209 | -1.1339451 | 0.2692339 | |
| Nasal | 0.6201093 | 0.3245018 | 17.83245 | 1.9109582 | 0.0722216 | |
| Sputum | -0.0703197 | 0.3559340 | 21.62209 | -0.1975637 | 0.8452319 | |
| lyPMA | 0.3624825 | 0.2391309 | 110.63626 | 1.5158327 | 0.1324137 | |
| Benzonase | 0.0678140 | 0.2391309 | 110.63626 | 0.2835852 | 0.7772583 | |
| Host zero | 0.2506613 | 0.2391309 | 110.63626 | 1.0482178 | 0.2968228 | |
| Molysis | 0.1413267 | 0.2391309 | 110.63626 | 0.5910015 | 0.5557249 | |
| QIAamp | 0.1588315 | 0.2391309 | 110.63626 | 0.6642031 | 0.5079422 | |
| Mock * lyPMA | -1.1298459 | 0.3381822 | 110.63626 | -3.3409380 | 0.0011392 |
|
| BAL * lyPMA | -0.0103440 | 0.3457828 | 110.63626 | -0.0299146 | 0.9761890 | |
| Nasal * lyPMA | -0.9035047 | 0.3277361 | 116.97484 | -2.7568060 | 0.0067736 |
|
| Sputum * lyPMA | 0.1775924 | 0.3457828 | 110.63626 | 0.5135953 | 0.6085591 | |
| Mock * Benzonase | -0.0827988 | 0.3381822 | 110.63626 | -0.2448348 | 0.8070379 | |
| BAL * Benzonase | 0.7430909 | 0.3457828 | 110.63626 | 2.1490108 | 0.0338131 |
|
| Nasal * Benzonase | 0.0750795 | 0.3280318 | 117.54161 | 0.2288786 | 0.8193610 | |
| Sputum * Benzonase | 0.7780564 | 0.3457828 | 110.63626 | 2.2501306 | 0.0264186 |
|
| Mock * Host zero | -0.2212240 | 0.3381822 | 110.63626 | -0.6541561 | 0.5143687 | |
| BAL * Host zero | 0.6995291 | 0.3457828 | 110.63626 | 2.0230305 | 0.0454812 |
|
| Nasal * Host zero | 0.6045330 | 0.3280318 | 117.54161 | 1.8429094 | 0.0678619 | |
| Sputum * Host zero | 1.4226694 | 0.3457828 | 110.63626 | 4.1143443 | 0.0000750 |
|
| Mock * Molysis | -0.1604345 | 0.3381822 | 110.63626 | -0.4744026 | 0.6361479 | |
| BAL * Molysis | 0.8944771 | 0.3457828 | 110.63626 | 2.5868180 | 0.0109839 |
|
| Nasal * Molysis | 0.0690814 | 0.3277361 | 116.97484 | 0.2107838 | 0.8334227 | |
| Sputum * Molysis | 1.8487779 | 0.3457828 | 110.63626 | 5.3466454 | 0.0000005 |
|
| Mock * QIAamp | -0.0570843 | 0.3381822 | 110.63626 | -0.1687973 | 0.8662643 | |
| BAL * QIAamp | 0.8892670 | 0.3457828 | 110.63626 | 2.5717504 | 0.0114447 |
|
| Nasal * QIAamp | 0.8921752 | 0.3280318 | 117.54161 | 2.7197824 | 0.0075237 |
|
| Sputum * QIAamp | 1.2569571 | 0.3457828 | 110.63626 | 3.6351063 | 0.0004232 |
|
Final reads x 10^6 - ANOVA
Which methods were effective in increasing the final reads?
Interaction term was significant
# ANOVA table for the mixed model of final reads expressed in millions
# (Final_reads / 1000000), with a random intercept per subject.
lmer(
  Final_reads / 1000000 ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  anova() %>%
  data.frame(check.names = FALSE) %>%
  # Flag terms with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>F)` < 0.05 ~ "*", .default = " ")) %>%
  # Show interaction terms as "a * b" instead of "a:b" in the row names.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| | Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | |
|---|---|---|---|---|---|---|---|
| sample_type | 112474.18 | 28118.545 | 4 | 2.0000 | 80.424737 | 0.0123190 | * |
| treatment | 13931.56 | 2786.311 | 5 | 114.6237 | 7.969415 | 0.0000018 | * |
| sample_type * treatment | 28755.96 | 1437.798 | 20 | 111.5071 | 4.112394 | 0.0000007 | * |
Final reads x 1000000
Which methods were effective in increasing the final reads?
lmer( Final reads / 10^6 vs sample_type + treatment + sample_type * treatment + subject_id )
Except lyPMA, every method increased final reads
# Fixed-effect coefficient table for the mixed model of final reads expressed
# in millions (Final_reads / 1000000), with a random intercept per subject.
lmer(
  Final_reads / 1000000 ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  .[["coefficients"]] %>%
  data.frame(check.names = FALSE) %>%
  # Flag coefficients with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>|t|)` < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "a * b".
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 1.1853377 | 7.914937 | 10.70305 | 0.1497596 | 0.8837390 | |
| Mock | 91.9828077 | 11.193412 | 10.70305 | 8.2175847 | 0.0000060 |
|
| BAL | -0.8398793 | 11.551901 | 32.84722 | -0.0727049 | 0.9424819 | |
| Nasal | 4.1141399 | 9.901834 | 21.66085 | 0.4154927 | 0.6818684 | |
| Sputum | -0.4726565 | 11.551901 | 32.84722 | -0.0409159 | 0.9676106 | |
| lyPMA | 19.5717199 | 11.322369 | 111.31565 | 1.7285887 | 0.0866554 | |
| Benzonase | -0.2317301 | 11.322369 | 111.31565 | -0.0204666 | 0.9837078 | |
| Host zero | 0.3892183 | 11.322369 | 111.31565 | 0.0343761 | 0.9726388 | |
| Molysis | -0.0657525 | 11.322369 | 111.31565 | -0.0058073 | 0.9953769 | |
| QIAamp | -0.0234189 | 11.322369 | 111.31565 | -0.0020684 | 0.9983534 | |
| Mock * lyPMA | -68.4152037 | 16.012248 | 111.31565 | -4.2726795 | 0.0000409 |
|
| BAL * lyPMA | -18.9652355 | 16.372119 | 111.31565 | -1.1583861 | 0.2491857 | |
| Nasal * lyPMA | -23.5033597 | 15.281061 | 117.67552 | -1.5380712 | 0.1267158 | |
| Sputum * lyPMA | -17.4439427 | 16.372119 | 111.31565 | -1.0654664 | 0.2889705 | |
| Mock * Benzonase | -2.4988813 | 16.012248 | 111.31565 | -0.1560606 | 0.8762679 | |
| BAL * Benzonase | 2.0219105 | 16.372119 | 111.31565 | 0.1234972 | 0.9019361 | |
| Nasal * Benzonase | 4.0152471 | 15.281115 | 117.71961 | 0.2627588 | 0.7931958 | |
| Sputum * Benzonase | 4.4010481 | 16.372119 | 111.31565 | 0.2688136 | 0.7885707 | |
| Mock * Host zero | 5.1518451 | 16.012248 | 111.31565 | 0.3217440 | 0.7482500 | |
| BAL * Host zero | 5.0852749 | 16.372119 | 111.31565 | 0.3106058 | 0.7566812 | |
| Nasal * Host zero | 36.9180859 | 15.281115 | 117.71961 | 2.4159289 | 0.0172321 |
|
| Sputum * Host zero | 34.8337485 | 16.372119 | 111.31565 | 2.1276262 | 0.0355752 |
|
| Mock * Molysis | -4.3094677 | 16.012248 | 111.31565 | -0.2691357 | 0.7883235 | |
| BAL * Molysis | 8.8170029 | 16.372119 | 111.31565 | 0.5385377 | 0.5912812 | |
| Nasal * Molysis | 7.9851831 | 15.281061 | 117.67552 | 0.5225542 | 0.6022669 | |
| Sputum * Molysis | 67.6630733 | 16.372119 | 111.31565 | 4.1328232 | 0.0000697 |
|
| Mock * QIAamp | 23.2897791 | 16.012248 | 111.31565 | 1.4544978 | 0.1486223 | |
| BAL * QIAamp | 8.4962045 | 16.372119 | 111.31565 | 0.5189435 | 0.6048303 | |
| Nasal * QIAamp | 36.6438222 | 15.281115 | 117.71961 | 2.3979810 | 0.0180573 |
|
| Sputum * QIAamp | 21.5737389 | 16.372119 | 111.31565 | 1.3177121 | 0.1903056 |
Host ratio ANOVA
Which methods were effective in lowering host %?
Interaction term was significant
# ANOVA table for the mixed model of the sequencing-based host proportion
# (sequencing_host_prop), with a random intercept per subject.
lmer(
  sequencing_host_prop ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  anova() %>%
  data.frame(check.names = FALSE) %>%
  # Flag terms with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>F)` < 0.05 ~ "*", .default = " ")) %>%
  # Show interaction terms as "a * b" instead of "a:b" in the row names.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| | Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | |
|---|---|---|---|---|---|---|---|
| sample_type | 1.140989 | 0.2852472 | 4 | 12.90241 | 19.94979 | 1.96e-05 | * |
| treatment | 1.844330 | 0.3688660 | 5 | 112.48557 | 25.79798 | 0.00e+00 | * |
| sample_type * treatment | 3.013480 | 0.1506740 | 20 | 111.00500 | 10.53793 | 0.00e+00 | * |
Host ratio
Which methods were effective in lowering host %?
lmer( Host DNA ratio vs sample_type + treatment + sample_type * treatment + (1|subject_id) )
Host zero was effective for all sample types. Molysis was effective for nasal and sputum samples. QIAamp was effective for nasal samples only.
# Fixed-effect coefficient table for the mixed model of the sequencing-based
# host proportion (sequencing_host_prop), with a random intercept per subject.
lmer(
  sequencing_host_prop ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  .[["coefficients"]] %>%
  data.frame(check.names = FALSE) %>%
  # Flag coefficients with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>|t|)` < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "a * b".
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 0.1580489 | 0.1236466 | 14.44554 | 1.2782307 | 0.2213255 | |
| Mock | -0.1555157 | 0.1748628 | 14.44554 | -0.8893586 | 0.3883812 | |
| BAL | 0.8383419 | 0.1439765 | 18.45998 | 5.8227692 | 0.0000147 |
|
| Nasal | 0.7942119 | 0.1341971 | 16.58294 | 5.9182492 | 0.0000187 |
|
| Sputum | 0.8317866 | 0.1439765 | 18.45998 | 5.7772389 | 0.0000162 |
|
| lyPMA | -0.0125376 | 0.0724064 | 109.71034 | -0.1731564 | 0.8628476 | |
| Benzonase | 0.0820089 | 0.0724064 | 109.71034 | 1.1326192 | 0.2598435 | |
| Host zero | 0.0513961 | 0.0724064 | 109.71034 | 0.7098273 | 0.4793172 | |
| Molysis | 0.0340392 | 0.0724064 | 109.71034 | 0.4701129 | 0.6392075 | |
| QIAamp | 0.0122709 | 0.0724064 | 109.71034 | 0.1694722 | 0.8657373 | |
| Mock * lyPMA | 0.0665440 | 0.1023982 | 109.71034 | 0.6498555 | 0.5171439 | |
| BAL * lyPMA | -0.0186970 | 0.1046995 | 109.71034 | -0.1785781 | 0.8585983 | |
| Nasal * lyPMA | -0.2655229 | 0.1001256 | 114.99868 | -2.6518985 | 0.0091339 |
|
| Sputum * lyPMA | -0.0255115 | 0.1046995 | 109.71034 | -0.2436637 | 0.8079463 | |
| Mock * Benzonase | -0.0782140 | 0.1023982 | 109.71034 | -0.7638225 | 0.4466126 | |
| BAL * Benzonase | -0.0934606 | 0.1046995 | 109.71034 | -0.8926556 | 0.3739954 | |
| Nasal * Benzonase | -0.2844798 | 0.1003189 | 115.63457 | -2.8357539 | 0.0053985 |
|
| Sputum * Benzonase | -0.1445493 | 0.1046995 | 109.71034 | -1.3806103 | 0.1702051 | |
| Mock * Host zero | -0.0466232 | 0.1023982 | 109.71034 | -0.4553132 | 0.6497838 | |
| BAL * Host zero | -0.2339486 | 0.1046995 | 109.71034 | -2.2344755 | 0.0274786 |
|
| Nasal * Host zero | -0.7898081 | 0.1003189 | 115.63457 | -7.8729711 | 0.0000000 |
|
| Sputum * Host zero | -0.5061958 | 0.1046995 | 109.71034 | -4.8347467 | 0.0000044 |
|
| Mock * Molysis | -0.0301512 | 0.1023982 | 109.71034 | -0.2944502 | 0.7689703 | |
| BAL * Molysis | -0.2110097 | 0.1046995 | 109.71034 | -2.0153834 | 0.0463093 |
|
| Nasal * Molysis | -0.5388071 | 0.1001256 | 114.99868 | -5.3813128 | 0.0000004 |
|
| Sputum * Molysis | -0.7303645 | 0.1046995 | 109.71034 | -6.9758137 | 0.0000000 |
|
| Mock * QIAamp | -0.0085052 | 0.1023982 | 109.71034 | -0.0830597 | 0.9339554 | |
| BAL * QIAamp | -0.0749318 | 0.1046995 | 109.71034 | -0.7156840 | 0.4757069 | |
| Nasal * QIAamp | -0.7638065 | 0.1003189 | 115.63457 | -7.6137819 | 0.0000000 |
|
| Sputum * QIAamp | -0.1992210 | 0.1046995 | 109.71034 | -1.9027875 | 0.0596896 |
Gram negatives - ANOVA
Which methods changed the Gram-stain ratio?
Square root transformation was required
Interaction term was significant
# Histograms of gram_neg_prop before and after a square-root transform, used
# to inspect the distribution ahead of the sqrt() model below.
# The expressions are left exactly as written: hist() builds its default
# title and x-axis label from the text of its argument.
hist(sample_data %>% data.frame %>% .$gram_neg_prop)
hist(sample_data %>% data.frame %>% .$gram_neg_prop %>% sqrt())
# ANOVA table for the mixed model of the square-root-transformed
# Gram-negative proportion, with a random intercept per subject.
lmer(
  sqrt(gram_neg_prop) ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  anova() %>%
  data.frame(check.names = FALSE) %>%
  # Flag terms with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>F)` < 0.05 ~ "*", .default = " ")) %>%
  # Show interaction terms as "a * b" instead of "a:b" in the row names.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| | Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | |
|---|---|---|---|---|---|---|---|
| sample_type | 0.2694336 | 0.0673584 | 4 | 15.21615 | 3.261401 | 0.0406059 | * |
| treatment | 2.4246546 | 0.4849309 | 5 | 110.60775 | 23.479689 | 0.0000000 | * |
| sample_type * treatment | 4.3971097 | 0.2198555 | 20 | 109.73140 | 10.645101 | 0.0000000 | * |
Gram negatives
Which method biased gram positive-negative ratio
lmer( Gram-negative ratio vs sample_type + treatment + sample_type * treatment + (1|subject_id) )
Some (commercial) treatments changed the Gram-negative proportion
# Fixed-effect coefficient table for the mixed model of the
# square-root-transformed Gram-negative proportion, with a random intercept
# per subject.
lmer(
  sqrt(gram_neg_prop) ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  .[["coefficients"]] %>%
  data.frame(check.names = FALSE) %>%
  # Flag coefficients with p < 0.05 in an unnamed trailing column.
  mutate(` ` = case_when(`Pr(>|t|)` < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "a * b".
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 0.6047523 | 0.1990827 | 16.01065 | 3.0376933 | 0.0078331 |
|
| Mock | 0.1860382 | 0.2815455 | 16.01065 | 0.6607749 | 0.5181522 | |
| BAL | 0.0046103 | 0.2286078 | 19.28745 | 0.0201669 | 0.9841174 | |
| Nasal | -0.4881059 | 0.2128813 | 17.29861 | -2.2928551 | 0.0346432 |
|
| Sputum | 0.2582871 | 0.2258383 | 18.40344 | 1.1436815 | 0.2674149 | |
| lyPMA | -0.1039615 | 0.0870221 | 108.52867 | -1.1946562 | 0.2348257 | |
| Benzonase | -0.2799287 | 0.0870221 | 108.52867 | -3.2167546 | 0.0017090 |
|
| Host zero | 0.1510556 | 0.0870221 | 108.52867 | 1.7358307 | 0.0854319 | |
| Molysis | 0.1211041 | 0.0870221 | 108.52867 | 1.3916475 | 0.1668749 | |
| QIAamp | -0.0908688 | 0.0870221 | 108.52867 | -1.0442039 | 0.2987113 | |
| Mock * lyPMA | -0.0728087 | 0.1230678 | 108.52867 | -0.5916141 | 0.5553394 | |
| BAL * lyPMA | 0.1934377 | 0.1337888 | 108.52867 | 1.4458438 | 0.1511027 | |
| Nasal * lyPMA | 0.6303581 | 0.1210478 | 112.15168 | 5.2075150 | 0.0000009 |
|
| Sputum * lyPMA | -0.2141299 | 0.1258337 | 108.52867 | -1.7016893 | 0.0916767 | |
| Mock * Benzonase | -0.4969703 | 0.1230678 | 108.52867 | -4.0381827 | 0.0001007 |
|
| BAL * Benzonase | 0.3263289 | 0.1307391 | 108.92522 | 2.4960312 | 0.0140576 |
|
| Nasal * Benzonase | 0.3428788 | 0.1213791 | 112.64674 | 2.8248588 | 0.0055963 |
|
| Sputum * Benzonase | -0.1759531 | 0.1258337 | 108.52867 | -1.3982981 | 0.1648745 | |
| Mock * Host zero | -0.9405663 | 0.1230678 | 108.52867 | -7.6426670 | 0.0000000 |
|
| BAL * Host zero | -0.1969698 | 0.1307391 | 108.92522 | -1.5065868 | 0.1348113 | |
| Nasal * Host zero | -0.1463399 | 0.1213791 | 112.64674 | -1.2056436 | 0.2304813 | |
| Sputum * Host zero | -0.7742055 | 0.1258337 | 108.52867 | -6.1526073 | 0.0000000 |
|
| Mock * Molysis | -0.8235144 | 0.1230678 | 108.52867 | -6.6915495 | 0.0000000 |
|
| BAL * Molysis | -0.1661338 | 0.1307391 | 108.92522 | -1.2707275 | 0.2065329 | |
| Nasal * Molysis | -0.0095905 | 0.1210478 | 112.15168 | -0.0792289 | 0.9369917 | |
| Sputum * Molysis | -0.7649241 | 0.1258337 | 108.52867 | -6.0788481 | 0.0000000 |
|
| Mock * QIAamp | -0.6636793 | 0.1230678 | 108.52867 | -5.3927935 | 0.0000004 |
|
| BAL * QIAamp | 0.0758323 | 0.1307391 | 108.92522 | 0.5800275 | 0.5630940 | |
| Nasal * QIAamp | 0.1192747 | 0.1213791 | 112.64674 | 0.9826623 | 0.3278795 | |
| Sputum * QIAamp | -0.5568238 | 0.1258337 | 108.52867 | -4.4250758 | 0.0000230 |
|
Results
1. Library failure was associated with Nasal, especially after lyPMA and Molysis treatment
2. Benzonase, host-zero, Molysis, and QIAamp increased final reads
3. Host-zero lowered host %. For the others, there were significant sample_type-specific treatment efficiencies
A3. LM of taxa alpha diversity
Alpha diversity may change due to treatment.
Both stratified and nonstratified analyses were conducted.
Figure - Alpha diversity
# Sample metadata of the count-based phyloseq object; used below for the row
# filter of the alpha-diversity panel. The variable shares its name with the
# sample_data() accessor; calls to sample_data(...) still resolve because R
# skips non-function bindings when looking up a function.
sample_data <- sample_data(phyloseq$phyloseq_count)

# 4A: species richness (S.obs) by treatment, one facet per sample type.
f4a <- ggplot(
  subset(
    sample_data(phyloseq$phyloseq_count),
    sample_data$sample_type %in% c("Sputum", "Nasal", "BAL", "Mock", "Neg.")
  ),
  aes(y = S.obs)
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  # Colors chosen with https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  # Labels are matched to the treatment levels by position; the last label
  # previously read "QIAaamp" (typo) and now reads "QIAamp".
  scale_fill_manual(
    values = c(
      "#e31a1c", "#fb9a99", "#33a02c",
      "#b2df8a", "#1f78b4", "#a6cee3"
    ),
    name = "Treatment",
    labels = c(
      "Control", "lyPMA", "Benzonase",
      "Host zero", "Molysis", "QIAamp"
    )
  ) +
  ylab("Species richness") +
  theme_classic(base_size = 12, base_family = "serif") +
  labs(tag = "A") +
  # No x aesthetic is mapped, so the x-axis text and ticks are hidden.
  theme(
    plot.tag = element_text(size = 15),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  facet_wrap(~sample_type, nrow = 1) +
  guides(fill = guide_legend(nrow = 1))
# 4B: inverse Simpson index (data_invsimpson) by treatment, one facet per
# sample type.
f4b <- ggplot(
  subset(
    sample_data(phyloseq$phyloseq_count),
    sample_data$sample_type %in% c("Sputum", "Nasal", "BAL", "Mock", "Neg.")
  ),
  aes(y = data_invsimpson)
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  # Disabled alternative palette (viridis):
  # scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) +
  # Colors chosen with https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  # Labels are matched to the treatment levels by position; the last label
  # previously read "QIAaamp" (typo) and now reads "QIAamp".
  scale_fill_manual(
    values = c(
      "#e31a1c", "#fb9a99", "#33a02c",
      "#b2df8a", "#1f78b4", "#a6cee3"
    ),
    name = "Treatment",
    labels = c(
      "Control", "lyPMA", "Benzonase",
      "Host zero", "Molysis", "QIAamp"
    )
  ) +
  ylab("Inverse simpson") +
  theme_classic(base_size = 12, base_family = "serif") +
  labs(tag = "B") +
  # No x aesthetic is mapped, so the x-axis text and ticks are hidden.
  theme(
    plot.tag = element_text(size = 15),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  facet_wrap(~sample_type, nrow = 1) +
  guides(fill = guide_legend(nrow = 1))
ggarrange(f4a, f4b, common.legend = T, align = "hv", ncol = 1) # alpha diversity plots
Species richness
All samples:
S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)
Stratified:
S.obs ~ sample_type + log10 (Final_reads) + (1|original_sample)
Species richness (all samples & interaction term) - ANOVA
Interaction term was significant
# Species richness, all samples.
# Sample metadata as a plain data frame, without rows whose `simpson`
# value is NaN.
sample_data <- data.frame(
  sample_data(phyloseq$phyloseq_count),
  check.names = FALSE
)
sample_data <- sample_data[!is.nan(sample_data$simpson), , drop = FALSE]

# Mixed model: sample_type x treatment interaction, log10 sequencing depth
# as covariate, random intercept per original sample.
lmer_sob <- lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | original_sample),
  data = sample_data
)

# ANOVA table; terms with p < 0.05 are flagged with "*".
anova(lmer_sob) %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = if_else(`Pr(>F)` < 0.05, "*", " ", missing = " ")) %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | ||
|---|---|---|---|---|---|---|---|
| sample_type | 9339.068 | 2334.7671 | 4 | 18.65728 | 48.084830 | 0.0000000 |
|
| treatment | 1112.825 | 222.5649 | 5 | 32.39841 | 4.583753 | 0.0028414 |
|
| log10(Final_reads) | 2319.130 | 2319.1296 | 1 | 110.10414 | 47.762773 | 0.0000000 |
|
| sample_type:treatment | 16450.723 | 822.5361 | 20 | 56.70248 | 16.940238 | 0.0000000 |
|
Species richness (all samples & interaction term)
Species richness in sputum increased with every treatment. Positive and negative controls showed no changes
# Coefficient table for species richness across all samples.
# NOTE(review): this fit groups by subject_id, whereas `lmer_sob` (used for
# the ANOVA) and the formula stated in the text group by original_sample.
# Confirm which grouping is intended.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
  data = sample_data
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | -65.2137721 | 13.663089 | 58.06809 | -4.7729888 | 0.0000127 |
|
| Mock | 4.4374034 | 13.759942 | 19.41913 | 0.3224871 | 0.7505299 | |
| BAL | 7.1337567 | 10.875564 | 21.10034 | 0.6559436 | 0.5189497 | |
| Nasal | -0.8132815 | 10.160360 | 19.19598 | -0.0800446 | 0.9370304 | |
| Sputum | 13.4458550 | 10.715136 | 19.95934 | 1.2548469 | 0.2240295 | |
| lyPMA | -2.9525667 | 5.209605 | 103.96556 | -0.5667544 | 0.5721026 | |
| Benzonase | -2.5012734 | 4.662534 | 103.48455 | -0.5364623 | 0.5927905 | |
| Host zero | -4.9571049 | 4.694185 | 103.61256 | -1.0560097 | 0.2934199 | |
| Molysis | -3.1939654 | 4.399990 | 103.50236 | -0.7259028 | 0.4695366 | |
| QIAamp | -2.5957573 | 4.402037 | 103.51120 | -0.5896718 | 0.5566956 | |
| log10(Final_reads) | 11.5278137 | 1.682901 | 111.97999 | 6.8499667 | 0.0000000 |
|
| Mock * lyPMA | 5.9985885 | 7.041000 | 104.64472 | 0.8519512 | 0.3961867 | |
| BAL * lyPMA | -0.6767290 | 7.075936 | 103.51295 | -0.0956381 | 0.9239929 | |
| Nasal * lyPMA | 4.7115213 | 6.833865 | 106.98554 | 0.6894372 | 0.4920408 | |
| Sputum * lyPMA | 31.5266837 | 6.712682 | 103.47849 | 4.6965853 | 0.0000082 |
|
| Mock * Benzonase | -9.7259848 | 6.278580 | 103.47678 | -1.5490740 | 0.1244171 | |
| BAL * Benzonase | -2.2924838 | 6.739559 | 104.27989 | -0.3401534 | 0.7344252 | |
| Nasal * Benzonase | 0.3703339 | 6.193204 | 106.92939 | 0.0597968 | 0.9524291 | |
| Sputum * Benzonase | 55.7502373 | 6.509629 | 103.77908 | 8.5642729 | 0.0000000 |
|
| Mock * Host zero | -6.7822432 | 6.296671 | 103.53153 | -1.0771158 | 0.2839319 | |
| BAL * Host zero | 1.3576903 | 6.717893 | 104.20714 | 0.2021006 | 0.8402321 | |
| Nasal * Host zero | 4.2148945 | 6.255837 | 107.21605 | 0.6737539 | 0.5019177 | |
| Sputum * Host zero | 81.6672601 | 6.775316 | 104.48112 | 12.0536465 | 0.0000000 |
|
| Mock * Molysis | -5.3857637 | 6.086755 | 103.48748 | -0.8848334 | 0.3782977 | |
| BAL * Molysis | 8.2076150 | 6.626441 | 104.54515 | 1.2386160 | 0.2182617 | |
| Nasal * Molysis | 6.4462676 | 5.980617 | 106.70823 | 1.0778600 | 0.2835269 | |
| Sputum * Molysis | 84.8524102 | 6.921132 | 105.29508 | 12.2599029 | 0.0000000 |
|
| Mock * QIAamp | -10.3771663 | 6.079999 | 103.46628 | -1.7067710 | 0.0908642 | |
| BAL * QIAamp | -1.3323238 | 6.624385 | 104.53877 | -0.2011242 | 0.8409923 | |
| Nasal * QIAamp | -2.4363625 | 6.159171 | 107.11278 | -0.3955666 | 0.6932115 | |
| Sputum * QIAamp | 65.6748093 | 6.542365 | 104.40242 | 10.0383897 | 0.0000000 |
|
Species richness - stratified (Pos + Neg)
No treatment increased species richness after adjusting for sequencing depth. In the mock community, all treatments except lyPMA even reduced the possible contaminants. Need to observe alpha diversity of positive controls
# Species richness in the controls only (Mock and Neg.).
# Ordinary linear model: no random effect is fitted in this stratum.
lm(
  S.obs ~ sample_type * treatment + log10(Final_reads),
  data = subset(sample_data, sample_type %in% c("Neg.", "Mock"))
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|
| (Intercept) | -57.761766 | 5.6845191 | -10.161241 | 0.0000000 |
|
| Mock | 7.100468 | 2.5001958 | 2.839965 | 0.0068070 |
|
| lyPMA | -2.046689 | 1.9442976 | -1.052663 | 0.2982424 | |
| Benzonase | -2.324790 | 1.6791901 | -1.384471 | 0.1731957 | |
| Host zero | -4.509954 | 1.7070781 | -2.641914 | 0.0113724 |
|
| Molysis | -2.974202 | 1.5867932 | -1.874348 | 0.0675314 | |
| QIAamp | -2.353753 | 1.5886026 | -1.481650 | 0.1455601 | |
| log10(Final_reads) | 10.257283 | 0.9503216 | 10.793487 | 0.0000000 |
|
| Mock * lyPMA | 4.117753 | 2.7538395 | 1.495277 | 0.1419805 | |
| Mock * Benzonase | -9.921507 | 2.2598585 | -4.390322 | 0.0000700 |
|
| Mock * Host zero | -7.191993 | 2.2758523 | -3.160132 | 0.0028518 |
|
| Mock * Molysis | -5.629804 | 2.1926081 | -2.567629 | 0.0137140 |
|
| Mock * QIAamp | -10.489897 | 2.1866232 | -4.797305 | 0.0000188 |
|
Species richness - stratified (NS + Pos + Neg)
Molysis and Host zero may have increased species richness of nasal swabs. Data include nasal swabs, positive controls, and negative controls
# Species richness: nasal swabs together with the controls (Mock, Neg.).
# Random intercept per subject_id.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("Nasal", "Mock", "Neg."))
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | -48.3698333 | 4.7155547 | 52.101386 | -10.2575065 | 0.0000000 |
|
| Mock | 10.4567880 | 3.2013447 | 9.219932 | 3.2663736 | 0.0094321 |
|
| Nasal | 1.0584267 | 2.2871939 | 8.360166 | 0.4627621 | 0.6553425 | |
| lyPMA | -0.9049918 | 1.9421510 | 65.381924 | -0.4659740 | 0.6427828 | |
| Benzonase | -2.1023643 | 1.7222997 | 64.251365 | -1.2206728 | 0.2266708 | |
| Host zero | -3.9463992 | 1.7382529 | 64.560072 | -2.2703251 | 0.0265330 |
|
| Molysis | -2.6972289 | 1.6258740 | 64.294648 | -1.6589409 | 0.1019986 | |
| QIAamp | -2.0487503 | 1.6269066 | 64.316109 | -1.2592919 | 0.2124760 | |
| log10(Final_reads) | 8.6560055 | 0.7266051 | 72.894750 | 11.9129429 | 0.0000000 |
|
| Mock * lyPMA | 1.7472931 | 2.6588054 | 66.840586 | 0.6571723 | 0.5133268 | |
| Nasal * lyPMA | 1.0548657 | 2.5419334 | 69.147897 | 0.4149856 | 0.6794372 | |
| Mock * Benzonase | -10.1679273 | 2.3189072 | 64.232457 | -4.3847927 | 0.0000440 |
|
| Nasal * Benzonase | 0.5052016 | 2.2565764 | 68.225974 | 0.2238797 | 0.8235200 | |
| Mock * Host zero | -7.7084106 | 2.3280333 | 64.365304 | -3.3111255 | 0.0015263 |
|
| Nasal * Host zero | 5.7835569 | 2.2862277 | 68.760586 | 2.5297378 | 0.0137105 |
|
| Mock * Molysis | -5.9373741 | 2.2485210 | 64.258498 | -2.6405686 | 0.0103784 |
|
| Nasal * Molysis | 6.6091517 | 2.1806238 | 68.119896 | 3.0308536 | 0.0034459 |
|
| Mock * QIAamp | -10.6319746 | 2.2451113 | 64.206853 | -4.7356113 | 0.0000125 |
|
| Nasal * QIAamp | -0.0884937 | 2.2700007 | 68.714798 | -0.0389840 | 0.9690162 |
Species richness (BAL + Pos + Neg)
No changes observed
# Species richness: BAL together with the controls (Mock, Neg.).
# Random intercept per subject_id.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("BAL", "Mock", "Neg."))
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | -64.1494503 | 10.085114 | 14.633239 | -6.3608059 | 0.0000144 |
|
| Mock | 4.8177516 | 10.407626 | 4.516828 | 0.4629059 | 0.6648679 | |
| BAL | 7.1148889 | 8.088054 | 4.579753 | 0.8796787 | 0.4227828 | |
| lyPMA | -2.8231861 | 3.099515 | 62.173149 | -0.9108477 | 0.3658936 | |
| Benzonase | -2.4760675 | 2.739453 | 62.047185 | -0.9038546 | 0.3695685 | |
| Host zero | -4.8932412 | 2.767327 | 62.081383 | -1.7682194 | 0.0819374 | |
| Molysis | -3.1625780 | 2.586407 | 62.051972 | -1.2227689 | 0.2260409 | |
| QIAamp | -2.5611934 | 2.588212 | 62.054347 | -0.9895610 | 0.3262317 | |
| log10(Final_reads) | 11.3463521 | 1.211568 | 63.597673 | 9.3650145 | 0.0000000 |
|
| Mock * lyPMA | 5.7299609 | 4.262556 | 62.339898 | 1.3442545 | 0.1837376 | |
| BAL * lyPMA | -0.7332389 | 4.160543 | 62.054816 | -0.1762364 | 0.8606821 | |
| Mock * Benzonase | -9.7539099 | 3.688201 | 62.045094 | -2.6446250 | 0.0103453 |
|
| BAL * Benzonase | -2.2191716 | 3.993078 | 62.345793 | -0.5557546 | 0.5803676 | |
| Mock * Host zero | -6.8407652 | 3.704152 | 62.059792 | -1.8467832 | 0.0695476 | |
| BAL * Host zero | 1.4176198 | 3.974005 | 62.323079 | 0.3567232 | 0.7225040 | |
| Mock * Molysis | -5.4206184 | 3.576524 | 62.047973 | -1.5156107 | 0.1346960 | |
| BAL * Molysis | 8.3155563 | 3.947315 | 62.427384 | 2.1066363 | 0.0391749 |
|
| Mock * QIAamp | -10.3932670 | 3.570564 | 62.042263 | -2.9108196 | 0.0050010 |
|
| BAL * QIAamp | -1.2253279 | 3.945518 | 62.425549 | -0.3105620 | 0.7571682 |
Species richness (sputum + Pos + Neg)
Benzonase may have increased species richness of sputum
# Species richness: sputum together with the controls (Mock, Neg.).
# NOTE(review): this stratum groups by original_sample, while the Nasal and
# BAL strata group by subject_id. Confirm the difference is intended.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | original_sample),
  data = subset(sample_data, sample_type %in% c("Sputum", "Mock", "Neg."))
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | -80.8428494 | 24.483664 | 16.275172 | -3.3019098 | 0.0044172 |
|
| Mock | 1.5271401 | 24.710462 | 4.563959 | 0.0618014 | 0.9533349 | |
| Sputum | 15.5782972 | 18.946159 | 4.382895 | 0.8222404 | 0.4533281 | |
| lyPMA | -2.5612761 | 6.285971 | 62.018467 | -0.4074591 | 0.6850739 | |
| Benzonase | -0.7801785 | 5.621157 | 62.002595 | -0.1387932 | 0.8900634 | |
| Host zero | -3.7294816 | 5.650321 | 62.005034 | -0.6600477 | 0.5116677 | |
| Molysis | -1.5517775 | 5.332781 | 62.002602 | -0.2909884 | 0.7720314 | |
| QIAamp | -0.9941171 | 24.057936 | 4.102715 | -0.0413218 | 0.9689724 | |
| log10(Final_reads) | 13.8441978 | 2.917530 | 62.236606 | 4.7451780 | 0.0000126 |
|
| Mock * lyPMA | 7.4827211 | 25.045276 | 4.815427 | 0.2987678 | 0.7775786 | |
| Sputum * lyPMA | 29.8843722 | 7.882902 | 62.002590 | 3.7910369 | 0.0003422 |
|
| Mock * Benzonase | -11.3144543 | 24.641626 | 4.514500 | -0.4591602 | 0.6673703 | |
| Sputum * Benzonase | 52.0697817 | 7.949374 | 62.026246 | 6.5501735 | 0.0000000 |
|
| Mock * Host zero | -7.9801398 | 24.646055 | 4.517726 | -0.3237897 | 0.7605421 | |
| Sputum * Host zero | 76.5635602 | 8.684459 | 62.060665 | 8.8161579 | 0.0000000 |
|
| Mock * Molysis | -6.8857757 | 24.577576 | 4.467936 | -0.2801650 | 0.7918519 | |
| Sputum * Molysis | 78.6003757 | 9.317212 | 62.092483 | 8.4360402 | 0.0000000 |
|
| Mock * QIAamp | -12.1165773 | 24.578355 | 4.468499 | -0.4929775 | 0.6452841 | |
| Sputum * QIAamp | 60.7936588 | 24.905063 | 4.709133 | 2.4410161 | 0.0616547 |
Simpson
Inverse Simpson of all samples:
Inverse Simpson ~ sample_type * treatment + log10(Final_reads) + (1|original_sample)
Stratified:
Inverse Simpson ~ treatment + (1|original_sample)
Inv Simp - ANOVA
Final reads did not affect inverse Simpson
# Inverse Simpson, all samples: sample_type x treatment interaction with
# log10 sequencing depth as covariate and a random intercept per subject_id.
lmer_invsimpson <- lmer(
  data_invsimpson ~ sample_type * treatment + log10(Final_reads) +
    (1 | subject_id),
  data = sample_data
)

# ANOVA table; terms with p < 0.05 are flagged with "*" and interaction
# terms are shown as "A * B".
anova(lmer_invsimpson) %>%
  data.frame(check.names = FALSE) %>%
  rownames_to_column(var = "term") %>%
  mutate(
    ` ` = if_else(`Pr(>F)` < 0.05, "*", " ", missing = " "),
    term = gsub(":", " * ", term)
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | ||
|---|---|---|---|---|---|---|---|
| sample_type | 81.2368012 | 20.3092003 | 4 | 16.50561 | 7.3006500 | 0.0014019 |
|
| treatment | 27.6140161 | 5.5228032 | 5 | 107.12281 | 1.9853097 | 0.0866036 | |
| log10(Final_reads) | 0.0591254 | 0.0591254 | 1 | 113.91246 | 0.0212541 | 0.8843465 | |
| sample_type * treatment | 180.1300010 | 9.0065000 | 20 | 105.21082 | 3.2376117 | 0.0000482 |
|
# Refit the inverse Simpson model without the log10(Final_reads) covariate.
# This overwrites the `lmer_invsimpson` object used for the ANOVA.
lmer_invsimpson <- lmer(
  data_invsimpson ~ sample_type * treatment + (1 | subject_id),
  data = sample_data
)
Simpson (all samples & interaction term)
Sputum after treatment showed differences - stratified analysis is required
# Inverse Simpson: coefficient table of the current `lmer_invsimpson` fit.
lmer_invsimpson %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 1.3904654 | 1.993990 | 17.46860 | 0.6973281 | 0.4947745 | |
| Mock | 0.9229546 | 2.803582 | 17.07082 | 0.3292055 | 0.7460049 | |
| BAL | 1.1225758 | 2.319270 | 22.03422 | 0.4840212 | 0.6331468 | |
| Nasal | 0.7040424 | 2.143412 | 19.24684 | 0.3284681 | 0.7461046 | |
| Sputum | 1.4619509 | 2.283075 | 20.77897 | 0.6403429 | 0.5289485 | |
| lyPMA | -0.1230344 | 1.212760 | 104.64849 | -0.1014499 | 0.9193875 | |
| Benzonase | -0.2877179 | 1.113992 | 104.64849 | -0.2582766 | 0.7967007 | |
| Host zero | -0.2454891 | 1.113992 | 104.64849 | -0.2203689 | 0.8260132 | |
| Molysis | -0.3864825 | 1.050281 | 104.64849 | -0.3679800 | 0.7136314 | |
| QIAamp | 0.0256096 | 1.050281 | 104.64849 | 0.0243836 | 0.9805931 | |
| Mock * lyPMA | 1.0668858 | 1.575422 | 104.64849 | 0.6772064 | 0.4997693 | |
| BAL * lyPMA | -0.2147395 | 1.688091 | 104.64849 | -0.1272085 | 0.8990195 | |
| Nasal * lyPMA | -0.4597750 | 1.550750 | 107.84461 | -0.2964854 | 0.7674296 | |
| Sputum * lyPMA | 3.3843171 | 1.604331 | 104.64849 | 2.1094879 | 0.0372876 |
|
| Mock * Benzonase | -0.2514286 | 1.500714 | 104.64849 | -0.1675393 | 0.8672691 | |
| BAL * Benzonase | -0.1667498 | 1.584503 | 105.08292 | -0.1052379 | 0.9163876 | |
| Nasal * Benzonase | 0.2138313 | 1.477869 | 108.62800 | 0.1446890 | 0.8852245 | |
| Sputum * Benzonase | 8.1281973 | 1.531035 | 104.64849 | 5.3089563 | 0.0000006 |
|
| Mock * Host zero | -0.4010042 | 1.500714 | 104.64849 | -0.2672089 | 0.7898342 | |
| BAL * Host zero | -0.6590815 | 1.584503 | 105.08292 | -0.4159548 | 0.6782913 | |
| Nasal * Host zero | 0.0087005 | 1.477869 | 108.62800 | 0.0058872 | 0.9953135 | |
| Sputum * Host zero | 6.1920480 | 1.531035 | 104.64849 | 4.0443546 | 0.0001006 |
|
| Mock * Molysis | -0.2112409 | 1.454049 | 104.64849 | -0.1452778 | 0.8847710 | |
| BAL * Molysis | 2.2654447 | 1.540377 | 105.10805 | 1.4707076 | 0.1443576 | |
| Nasal * Molysis | 0.3952162 | 1.427281 | 108.39576 | 0.2769015 | 0.7823831 | |
| Sputum * Molysis | 6.8905655 | 1.485322 | 104.64849 | 4.6391055 | 0.0000102 |
|
| Mock * QIAamp | -0.6311075 | 1.454049 | 104.64849 | -0.4340346 | 0.6651568 | |
| BAL * QIAamp | -0.7239017 | 1.540377 | 105.10805 | -0.4699509 | 0.6393637 | |
| Nasal * QIAamp | 0.3653475 | 1.430458 | 108.88195 | 0.2554059 | 0.7988914 | |
| Sputum * QIAamp | 4.6198553 | 1.485322 | 104.64849 | 3.1103392 | 0.0024082 |
|
Inverse Simpson - stratified (Controls)
Inverse Simpson ~ sample_type + log10 (Final_reads) + (1|original_sample)
Mock community treated with lyPMA showed changes in alpha diversity
# Inverse Simpson in the controls only (Mock and Neg.).
# NOTE(review): the formula printed above this chunk lists
# log10(Final_reads) and (1|original_sample); the fitted model has no depth
# covariate and groups by subject_id. Confirm which is intended.
lmer(
  data_invsimpson ~ sample_type * treatment + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("Mock", "Neg."))
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 1.3904654 | 0.3127895 | 45 | 4.4453711 | 0.0000568 |
|
| Mock | 0.9229546 | 0.4372993 | 45 | 2.1105789 | 0.0403988 |
|
| lyPMA | -0.1230344 | 0.2666497 | 45 | -0.4614084 | 0.6467283 | |
| Benzonase | -0.2877179 | 0.2449334 | 45 | -1.1746783 | 0.2463022 | |
| Host zero | -0.2454891 | 0.2449334 | 45 | -1.0022688 | 0.3215730 | |
| Molysis | -0.3864825 | 0.2309254 | 45 | -1.6736248 | 0.1011428 | |
| QIAamp | 0.0256096 | 0.2309254 | 45 | 0.1108998 | 0.9121889 | |
| Mock * lyPMA | 1.0668858 | 0.3463881 | 45 | 3.0800301 | 0.0035232 |
|
| Mock * Benzonase | -0.2514286 | 0.3299622 | 45 | -0.7619923 | 0.4500401 | |
| Mock * Host zero | -0.4010042 | 0.3299622 | 45 | -1.2153035 | 0.2305882 | |
| Mock * Molysis | -0.2112409 | 0.3197018 | 45 | -0.6607438 | 0.5121452 | |
| Mock * QIAamp | -0.6311075 | 0.3197018 | 45 | -1.9740507 | 0.0545336 |
Inverse Simpson - stratified (NS + Pos + Neg)
Inverse Simpson ~ sample_type + log10 (Final_reads) + (1|original_sample)
Only the mock community treated with lyPMA showed changes in alpha diversity.
# Inverse Simpson: nasal swabs together with the controls (Mock, Neg.).
# Random intercept per subject_id; no sequencing-depth covariate.
lmer(
  data_invsimpson ~ sample_type * treatment + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("Nasal", "Mock", "Neg."))
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 1.3904654 | 0.5652050 | 8.107287 | 2.4601081 | 0.0389326 |
|
| Mock | 0.9229546 | 0.7944323 | 7.910318 | 1.1617789 | 0.2791780 | |
| Nasal | 0.7040424 | 0.6083586 | 8.991409 | 1.1572819 | 0.2769754 | |
| lyPMA | -0.1230344 | 0.3530610 | 65.678631 | -0.3484792 | 0.7285949 | |
| Benzonase | -0.2877179 | 0.3243072 | 65.678631 | -0.8871771 | 0.3782208 | |
| Host zero | -0.2454891 | 0.3243072 | 65.678631 | -0.7569646 | 0.4517781 | |
| Molysis | -0.3864825 | 0.3057598 | 65.678631 | -1.2640070 | 0.2106953 | |
| QIAamp | 0.0256096 | 0.3057598 | 65.678631 | 0.0837572 | 0.9335042 | |
| Mock * lyPMA | 1.0668858 | 0.4586397 | 65.678631 | 2.3261961 | 0.0231044 |
|
| Nasal * lyPMA | -0.4609085 | 0.4512512 | 67.878495 | -1.0214011 | 0.3106910 | |
| Mock * Benzonase | -0.2514286 | 0.4368907 | 65.678631 | -0.5754955 | 0.5669228 | |
| Nasal * Benzonase | 0.2148218 | 0.4299931 | 68.402589 | 0.4995936 | 0.6189645 | |
| Mock * Host zero | -0.4010042 | 0.4368907 | 65.678631 | -0.9178593 | 0.3620509 | |
| Nasal * Host zero | 0.0096909 | 0.4299931 | 68.402589 | 0.0225373 | 0.9820850 | |
| Mock * Molysis | -0.2112409 | 0.4233052 | 65.678631 | -0.4990275 | 0.6194274 | |
| Nasal * Molysis | 0.3963498 | 0.4152887 | 68.250206 | 0.9543958 | 0.3432515 | |
| Mock * QIAamp | -0.6311075 | 0.4233052 | 65.678631 | -1.4909040 | 0.1407741 | |
| Nasal * QIAamp | 0.3643570 | 0.4161826 | 68.572171 | 0.8754740 | 0.3843724 |
Inverse Simpson - stratified (BAL + Pos + Neg)
Nothing changed in BAL
# Inverse Simpson: BAL together with the controls (Mock, Neg.).
# NOTE(review): this stratum groups by original_sample, while the controls
# and Nasal strata group by subject_id. Confirm the difference is intended.
lmer(
  data_invsimpson ~ sample_type * treatment + (1 | original_sample),
  data = subset(sample_data, sample_type %in% c("BAL", "Mock", "Neg."))
) %>%
  summary() %>%
  coef() %>%
  as.data.frame() %>%
  rownames_to_column(var = "term") %>%
  mutate(
    # Flag coefficients with p < 0.05.
    ` ` = if_else(`Pr(>|t|)` < 0.05, "*", " ", missing = " "),
    # Drop the variable-name prefixes and show interactions as "A * B".
    term = gsub(":", " * ", gsub("treatment|sample_type", "", term))
  ) %>%
  column_to_rownames(var = "term") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 1.4863671 | 1.0937033 | 4.313918 | 1.3590223 | 0.2408573 | |
| Mock | 0.7906648 | 1.5186971 | 4.008761 | 0.5206204 | 0.6300640 | |
| BAL | 0.9448441 | 1.3390001 | 6.635707 | 0.7056341 | 0.5044248 | |
| lyPMA | -0.2189361 | 1.0012729 | 60.495915 | -0.2186578 | 0.8276526 | |
| Benzonase | -0.3836196 | 0.9269986 | 60.495915 | -0.4138298 | 0.6804623 | |
| Host zero | -0.3413908 | 0.9269986 | 60.495915 | -0.3682754 | 0.7139535 | |
| Molysis | -0.4823842 | 0.8794281 | 60.495915 | -0.5485203 | 0.5853534 | |
| QIAamp | -0.0702921 | 1.5186971 | 4.008761 | -0.0462845 | 0.9652975 | |
| Mock * lyPMA | 1.1991757 | 1.7952871 | 7.771351 | 0.6679576 | 0.5234981 | |
| BAL * lyPMA | -0.1188378 | 1.3645049 | 60.495915 | -0.0870922 | 0.9308858 | |
| Mock * Benzonase | -0.1191388 | 1.7549458 | 7.113848 | -0.0678874 | 0.9477440 | |
| BAL * Benzonase | 0.0109819 | 1.2831549 | 61.060524 | 0.0085585 | 0.9931993 | |
| Mock * Host zero | -0.2687143 | 1.7549458 | 7.113848 | -0.1531183 | 0.8825579 | |
| BAL * Host zero | -0.4813498 | 1.2831549 | 61.060524 | -0.3751299 | 0.7088638 | |
| Mock * Molysis | -0.0789511 | 1.7302896 | 6.730905 | -0.0456288 | 0.9649302 | |
| BAL * Molysis | 2.4431764 | 1.2492213 | 61.090734 | 1.9557594 | 0.0550709 | |
| Mock * QIAamp | -0.4988176 | 1.7302896 | 6.730905 | -0.2882856 | 0.7818025 | |
| BAL * QIAamp | -0.5461700 | 1.7588636 | 7.131076 | -0.3105244 | 0.7650419 |
Inverse Simpson - stratified (spt + Pos + Neg)
Sputum changed after some treatment - but their changes were not treatment global.
# Inverse Simpson: sputum together with the controls (Mock, Neg.).
# The model terms are unchanged; only the stray duplicated "+" before the
# random-effect term was removed from the formula.
# NOTE(review): unlike the other inverse Simpson strata, this model keeps
# log10(Final_reads) and groups by original_sample. Confirm this is intended.
lmer(
  data_invsimpson ~ sample_type * treatment + log10(Final_reads) +
    (1 | original_sample),
  data = subset(
    sample_data,
    sample_data$sample_type == "Sputum" |
      sample_data$sample_type == "Mock" |
      sample_data$sample_type == "Neg."
  )
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  # Flag coefficients with p < 0.05.
  mutate(` ` = case_when(`Pr(>|t|)` < 0.05 ~ "*",
                         .default = " ")) %>%
  # Drop the variable-name prefixes and show interactions as "A * B".
  rownames_to_column(var = "x") %>%
  mutate(x = gsub("treatment|sample_type", "", x)) %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Estimate | Std. Error | df | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|---|
| (Intercept) | 0.1057716 | 5.2905539 | 15.486710 | 0.0199925 | 0.9843046 | |
| Mock | 0.3471655 | 5.4076909 | 4.532667 | 0.0641985 | 0.9515458 | |
| Sputum | 1.4104159 | 4.1483941 | 4.361819 | 0.3399908 | 0.7496129 | |
| lyPMA | -0.3469514 | 1.3400144 | 62.017636 | -0.2589162 | 0.7965574 | |
| Benzonase | -0.3799782 | 1.1982881 | 62.002626 | -0.3171009 | 0.7522324 | |
| Host zero | -0.3866053 | 1.2045058 | 62.004932 | -0.3209659 | 0.7493156 | |
| Molysis | -0.4865549 | 1.1368136 | 62.002632 | -0.4279989 | 0.6701350 | |
| QIAamp | -0.0784773 | 5.2722708 | 4.097262 | -0.0148849 | 0.9888207 | |
| log10(Final_reads) | 0.2293327 | 0.6219757 | 62.224062 | 0.3687164 | 0.7135907 | |
| Mock * lyPMA | 1.5014814 | 5.4772326 | 4.769507 | 0.2741314 | 0.7954573 | |
| Sputum * lyPMA | 3.4843773 | 1.6804348 | 62.002620 | 2.0734975 | 0.0422870 |
|
| Mock * Benzonase | -0.1210347 | 5.3933839 | 4.485942 | -0.0224413 | 0.9830593 | |
| Sputum * Benzonase | 8.0264719 | 1.6946139 | 62.024994 | 4.7364607 | 0.0000131 |
|
| Mock * Host zero | -0.2319418 | 5.3943035 | 4.488983 | -0.0429975 | 0.9675494 | |
| Sputum * Host zero | 5.9494149 | 1.8513301 | 62.057552 | 3.2135895 | 0.0020806 |
|
| Mock * Molysis | -0.0720894 | 5.3800864 | 4.442034 | -0.0133993 | 0.9898898 | |
| Sputum * Molysis | 6.5342420 | 1.9862327 | 62.087654 | 3.2897665 | 0.0016551 |
|
| Mock * QIAamp | -0.5156575 | 5.3802481 | 4.442565 | -0.0958427 | 0.9278159 | |
| Sputum * QIAamp | 4.3992555 | 5.4481017 | 4.669412 | 0.8074841 | 0.4585267 |
*** Results: ***
3.1. Species richness changes were sample-type- and method-specific. Sputum showed the largest changes, with every method
3.2. Stratified analysis showed that some methods increased some alpha diversity indices. Changes were largest in sputum. However, stratified analysis showed that Benzonase was the only method with significant changes.
A4. Mediation analysis
Mediation analysis (treatment - binary)
outcome = S.obs exposure = treatment (binary) mediator = Final_reads mediator-outcome confounders = sample_type exposure-mediator confounders = NA outcome model = Mixed effects linear regression mediator model = Mixed effects linear regression
Mediation analysis was conducted only for the respiratory samples (Sputum, BAL, and Nasal). Sample type was considered a mediator-outcome confounder, since it affected both S.obs and Final reads. Here, the model cannot account for the sample type × treatment interaction; therefore, none of the associations was significant.
# only mediator-outcome confounders
# Detach a package from the search path and ask R to unload its namespace.
#   pkg            : package name, unquoted by default
#   character.only : set TRUE when `pkg` is already a character string
# Does nothing when the package is not attached.
detach_package <- function(pkg, character.only = FALSE) {
  # Capture the unquoted name without evaluating the argument.
  pkg_name <- if (character.only) pkg else deparse(substitute(pkg))
  search_entry <- paste0("package:", pkg_name)

  # Repeat until no matching entry remains on the search path.
  while (search_entry %in% search()) {
    detach(search_entry, unload = TRUE, character.only = TRUE)
  }
}
# Detach lmerTest before fitting the mediation models.
# NOTE(review): presumably this makes lmer() resolve to the lme4 version,
# whose fits mediate() accepts. Confirm lme4 is still attached at this point.
detach_package(lmerTest)

## all treatment groups
# Respiratory samples only (Sputum, BAL, Nasal).
sample_data_respiratory <- subset(
  sample_data,
  sample_type %in% c("Sputum", "BAL", "Nasal")
)

# Mediator model: sequencing depth as a function of treatment.
med.fit <- lmer(log10(Final_reads) ~ treatment + (1 | subject_id),
                data = sample_data_respiratory)

# Outcome model: species richness with a treatment x depth interaction,
# adjusted for sample type.
out.fit <- lmer(
  S.obs ~ treatment * log10(Final_reads) + sample_type + (1 | subject_id),
  data = sample_data_respiratory
)

# NOTE(review): the section text describes a binary exposure, but
# `treatment` appears to be a multi-level factor (see the coefficient
# tables). mediate() compares `control.value` with `treat.value`; confirm
# the defaults give the intended contrast.
med.out <- mediate(model.m = med.fit,
                   model.y = out.fit,
                   treat = "treatment",
                   mediator = "log10(Final_reads)",
                   sims = 1000)
out.sum <- summary(med.out)

# Indirect, direct, total and proportion-mediated estimates with their
# p-values, rounded to three decimals.
final.out.all <- data.frame(indirect.est = out.sum$d.avg,
                            indirect.P.val = out.sum$d.avg.p,
                            direct.est = out.sum$z.avg,
                            direct.P.val = out.sum$z.avg.p,
                            total.est = out.sum$tau.coef,
                            total.P.val = out.sum$tau.p,
                            pm.est = out.sum$n.avg,
                            pm.P.val = out.sum$n.avg.p) %>%
  mutate(across(everything(), ~ round(.x, 3)))
final.out.all
Mediation analysis (treatment-stratified)
outcome = S.obs exposure = treatment (stratified) mediator = Final_reads mediator-outcome confounders = sample_type exposure-mediator confounders = NA outcome model = Mixed effects linear regression mediator model = Mixed effects linear regression
Mediation analysis was conducted stratified by treatment.
## lypma
med.fit <- lmer(log10(Final_reads) ~ lypma + (1|subject_id),
data = subset(sample_data_respiratory, sample_data_respiratory$control == 1 | sample_data_respiratory$lypma == 1))
out.fit <- lmer(S.obs ~ lypma * log10(Final_reads) + sample_type + (1|subject_id),
data = subset(sample_data_respiratory, sample_data_respiratory$control == 1 | sample_data_respiratory$lypma == 1))
med.out <- mediate(model.m = med.fit,
model.y = out.fit,
treat = "lypma",
mediator = "log10(Final_reads)",
sims = 1000)
out.sum <- summary(med.out)
final.out.lypma <- data.frame(indirect.est=out.sum$d.avg,
indirect.P.val=out.sum$d.avg.p,
direct.est=out.sum$z.avg,
direct.P.val=out.sum$z.avg.p,
total.est=out.sum$tau.coef,
total.P.val=out.sum$tau.p,
pm.est=out.sum$n.avg,
pm.P.val=out.sum$n.avg.p) %>%
mutate_all(~round(., 3))
## benzonase
med.fit <- lmer(log10(Final_reads) ~ benzonase + (1|subject_id),
data = subset(sample_data_respiratory, sample_data_respiratory$control == 1 | sample_data_respiratory$benzonase == 1))
out.fit <- lmer(S.obs ~ benzonase * log10(Final_reads) + sample_type + (1|subject_id),
data = subset(sample_data_respiratory, sample_data_respiratory$control == 1 | sample_data_respiratory$benzonase == 1))
med.out <- mediate(model.m = med.fit,
model.y = out.fit,
treat = "benzonase",
mediator = "log10(Final_reads)",
sims = 1000)
out.sum <- summary(med.out)
final.out.benzonase <- data.frame(indirect.est=out.sum$d.avg,
indirect.P.val=out.sum$d.avg.p,
direct.est=out.sum$z.avg,
direct.P.val=out.sum$z.avg.p,
total.est=out.sum$tau.coef,
total.P.val=out.sum$tau.p,
pm.est=out.sum$n.avg,
pm.P.val=out.sum$n.avg.p) %>%
mutate_all(~round(., 3))
## host zero
med.fit <- lmer(log10(Final_reads) ~ host_zero + (1|subject_id),
data = subset(sample_data_respiratory, sample_data_respiratory$control == 1 | sample_data_respiratory$host_zero == 1))
out.fit <- lmer(S.obs ~ host_zero * log10(Final_reads) + sample_type + (1|subject_id),
data = subset(sample_data_respiratory, sample_data_respiratory$control == 1 | sample_data_respiratory$host_zero == 1))
med.out <- mediate(model.m = med.fit,
model.y = out.fit,
treat = "host_zero",
mediator = "log10(Final_reads)",
sims = 1000)
out.sum <- summary(med.out)
final.out.host_zero <- data.frame(indirect.est=out.sum$d.avg,
indirect.P.val=out.sum$d.avg.p,
direct.est=out.sum$z.avg,
direct.P.val=out.sum$z.avg.p,
total.est=out.sum$tau.coef,
total.P.val=out.sum$tau.p,
pm.est=out.sum$n.avg,
pm.P.val=out.sum$n.avg.p) %>%
mutate_all(~round(., 3))
## molysis
# Mediation analysis for the Molysis treatment: how much of the treatment
# effect on S.obs runs through sequencing depth, log10(Final_reads)?
# Both models are fitted on untreated controls plus Molysis-treated samples.

# Mediator model: sequencing depth ~ treatment, random intercept per subject.
med.fit <- lmer(
  log10(Final_reads) ~ molysis + (1 | subject_id),
  data = subset(
    sample_data_respiratory,
    sample_data_respiratory$control == 1 | sample_data_respiratory$molysis == 1
  )
)

# Outcome model: S.obs ~ treatment x depth, adjusted for sample type.
out.fit <- lmer(
  S.obs ~ molysis * log10(Final_reads) + sample_type + (1 | subject_id),
  data = subset(
    sample_data_respiratory,
    sample_data_respiratory$control == 1 | sample_data_respiratory$molysis == 1
  )
)

# Simulation-based mediation estimate (1000 draws).
med.out <- mediate(
  model.m = med.fit,
  model.y = out.fit,
  treat = "molysis",
  mediator = "log10(Final_reads)",
  sims = 1000
)
out.sum <- summary(med.out)

# One-row summary: indirect, direct and total effects plus proportion mediated,
# each with its p-value, rounded to three decimals.
final.out.molysis <- data.frame(
  indirect.est = out.sum$d.avg,
  indirect.P.val = out.sum$d.avg.p,
  direct.est = out.sum$z.avg,
  direct.P.val = out.sum$z.avg.p,
  total.est = out.sum$tau.coef,
  total.P.val = out.sum$tau.p,
  pm.est = out.sum$n.avg,
  pm.P.val = out.sum$n.avg.p
) %>%
  mutate(across(everything(), ~ round(.x, 3)))
## qiaamp
# Mediation analysis for the QIAamp treatment: how much of the treatment
# effect on S.obs runs through sequencing depth, log10(Final_reads)?
# Both models are fitted on untreated controls plus QIAamp-treated samples.

# Mediator model: sequencing depth ~ treatment, random intercept per subject.
med.fit <- lmer(
  log10(Final_reads) ~ qiaamp + (1 | subject_id),
  data = subset(
    sample_data_respiratory,
    sample_data_respiratory$control == 1 | sample_data_respiratory$qiaamp == 1
  )
)

# Outcome model: S.obs ~ treatment x depth, adjusted for sample type.
out.fit <- lmer(
  S.obs ~ qiaamp * log10(Final_reads) + sample_type + (1 | subject_id),
  data = subset(
    sample_data_respiratory,
    sample_data_respiratory$control == 1 | sample_data_respiratory$qiaamp == 1
  )
)

# Simulation-based mediation estimate (1000 draws).
med.out <- mediate(
  model.m = med.fit,
  model.y = out.fit,
  treat = "qiaamp",
  mediator = "log10(Final_reads)",
  sims = 1000
)
out.sum <- summary(med.out)

# One-row summary: indirect, direct and total effects plus proportion mediated,
# each with its p-value, rounded to three decimals.
final.out.qiaamp <- data.frame(
  indirect.est = out.sum$d.avg,
  indirect.P.val = out.sum$d.avg.p,
  direct.est = out.sum$z.avg,
  direct.P.val = out.sum$z.avg.p,
  total.est = out.sum$tau.coef,
  total.P.val = out.sum$tau.p,
  pm.est = out.sum$n.avg,
  pm.P.val = out.sum$n.avg.p
) %>%
  mutate(across(everything(), ~ round(.x, 3)))
# Stack the five one-row mediation summaries and label each row with its
# treatment; the label order must match the order of the stacked data frames.
bind_rows(
  final.out.lypma,
  final.out.benzonase,
  final.out.host_zero,
  final.out.molysis,
  final.out.qiaamp
) %>%
  mutate(
    treatment = c("lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
    .before = indirect.est
  )
A5. Taxa beta diversity
Permanova (Taxa dist ~ log10(final reads) + sample_type + treatment + sample_type * treatment + subject_id) –> both stratified and nonstratified
Beta diversity figures
PCoA based on Horn-Morisita (all samples)
Jaccard dissimilarities (presence and absence) showed that BAL and Mock communities are distant. Some samples may overlap.
Changed to Horn-Morisita. It is less sensitive to changes due to sequencing depth
# Convert counts to per-sample relative abundances, then keep samples with a
# non-zero S.obs from the four sample types of interest.
phyloseq_rel_nz <- phyloseq$phyloseq_path_rpk %>%
  transform_sample_counts(function(x) {x/sum(x)}) %>%
  subset_samples(S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum"))

# PERMANOVA on Horn distances across all retained samples.
horn_perm_all <- vegan::adonis2(
  distance(phyloseq_rel_nz, method = "horn") ~ sample_type * treatment + subject_id + log10(Final_reads),
  data = data.frame(sample_data(phyloseq_rel_nz), check.names = FALSE),
  permutations = 10000
)

# Per-sample-type PERMANOVA with permutations restricted within subject
# (strata = subject_id). Each subset and its metadata are computed once inside
# local() so no helper objects leak into the workspace. The models are fitted in
# the order Nasal, BAL, Sputum so the random permutations are drawn in the same
# sequence as before.
horn_perm_ns <- local({
  ps_sub <- subset_samples(phyloseq_rel_nz, sample_type == "Nasal")
  meta_sub <- data.frame(sample_data(ps_sub), check.names = FALSE)
  vegan::adonis2(
    distance(ps_sub, method = "horn") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
    data = meta_sub,
    strata = meta_sub$subject_id,
    permutations = 10000
  )
})

horn_perm_bal <- local({
  ps_sub <- subset_samples(phyloseq_rel_nz, sample_type == "BAL")
  meta_sub <- data.frame(sample_data(ps_sub), check.names = FALSE)
  vegan::adonis2(
    distance(ps_sub, method = "horn") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
    data = meta_sub,
    strata = meta_sub$subject_id,
    permutations = 10000
  )
})

horn_perm_spt <- local({
  ps_sub <- subset_samples(phyloseq_rel_nz, sample_type == "Sputum")
  meta_sub <- data.frame(sample_data(ps_sub), check.names = FALSE)
  vegan::adonis2(
    distance(ps_sub, method = "horn") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
    data = meta_sub,
    strata = meta_sub$subject_id,
    permutations = 10000
  )
})
# Render the all-sample PERMANOVA result as an HTML table:
# readable term labels, significance stars, values rounded to three decimals.
horn_perm_all %>%
  data.frame(check.names = FALSE) %>%
  rownames_to_column("row.names") %>%
  # Map model terms to display labels (a term not listed here becomes NA).
  mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                               row.names == "treatment" ~ 'Treatment',
                               row.names == "subject_id" ~ 'Subject',
                               row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                               row.names == "sample_type:treatment" ~ 'Sample type * Treatment',
                               row.names == "Residual" ~ 'Residual',
                               row.names == "Total" ~ 'Total')) %>%
  column_to_rownames('row.names') %>%
  # Stars are derived from the unrounded p-values, before any rounding.
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.001 ~ "***",
                         abs(`Pr(>F)`) < 0.01 ~ "**",
                         abs(`Pr(>F)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  # where() + lambda replaces the deprecated across(is.numeric, round, digits = 3).
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  rename("<i>p</i>-value" = "Pr(>F)",
         "R<sup>2</sup>" = "R2",
         "Degree of freedom" = "Df") %>%
  # Fixed three decimals so small p-values print as 0.000 rather than 0.
  mutate(`<i>p</i>-value` = format(`<i>p</i>-value`, nsmall = 3)) %>%
  select(c("Degree of freedom", "R<sup>2</sup>", "<i>p</i>-value", " ")) %>%
  # escape = FALSE keeps the HTML markup in the column headers.
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Degree of freedom | R2 | p-value | ||
|---|---|---|---|---|
| Sample type | 3 | 0.556 | 0.000 | *** |
| Treatment | 5 | 0.069 | 0.000 | *** |
| Subject | 17 | 0.252 | 0.000 | *** |
| log10(Final reads) | 1 | 0.036 | 0.000 | *** |
| Sample type * Treatment | 15 | 0.033 | 0.006 | ** |
| Residual | 81 | 0.054 | NA | |
| Total | 122 | 1.000 | NA |
PERMANOVA - Morisita-Horn (stratified)
Distances between samples within each subject. Mean distance between control <-> treatment for each subject
# Format one stratified PERMANOVA result for display.
#
# perm_result: an adonis2 result whose rows are model terms and which has the
#              columns `R2` and `Pr(>F)` (as used below).
# Returns a data frame with display-labelled row names and three columns:
# R-squared, p-value (both rounded to three decimals) and significance stars.
format_stratified_permanova <- function(perm_result) {
  perm_result %>%
    data.frame(check.names = FALSE) %>%
    rownames_to_column('row.names') %>%
    # Map model terms to display labels (a term not listed here becomes NA).
    mutate(row.names = case_when(row.names == "lypma" ~ 'lyPMA',
                                 row.names == "benzonase" ~ 'Benzonase',
                                 row.names == "host_zero" ~ 'Host zero',
                                 row.names == "molysis" ~ 'Molysis',
                                 row.names == "qiaamp" ~ 'QIAamp',
                                 row.names == "subject_id" ~ 'Subject id',
                                 row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                 row.names == "Residual" ~ 'Residual',
                                 row.names == "Total" ~ 'Total')) %>%
    column_to_rownames('row.names') %>%
    # Stars are derived from the unrounded p-values, before any rounding.
    mutate(` ` = case_when(abs(`Pr(>F)`) < 0.001 ~ "***",
                           abs(`Pr(>F)`) < 0.01 ~ "**",
                           abs(`Pr(>F)`) < 0.05 ~ "*",
                           .default = " ")) %>%
    # where() + lambda replaces the deprecated across(is.numeric, round, digits = 3).
    mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
    rename("<i>p</i>-value" = "Pr(>F)",
           "R<sup>2</sup>" = "R2",
           "Degree of freedom" = "Df") %>%
    select(c("R<sup>2</sup>", "<i>p</i>-value", " "))
}

# One formatted table per sample type. The three models share the same terms,
# so their rows line up when bound side by side.
a <- format_stratified_permanova(horn_perm_bal)
b <- format_stratified_permanova(horn_perm_ns)
c <- format_stratified_permanova(horn_perm_spt)

cbind(a, b, c) %>%
  # escape = FALSE keeps the HTML markup in the column headers.
  kbl(format = "html", escape = FALSE) %>%
  add_header_above(c(" " = 1, "BAL" = 3, "Nasal swab" = 3,"Sputum"= 3)) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| | R2 | p-value | | R2 | p-value | | R2 | p-value | |
|---|---|---|---|---|---|---|---|---|---|
| lyPMA | -0.034 | 0.962 | | -0.032 | 0.993 | | -0.083 | 1 | |
| Benzonase | 0.069 | 0.081 | | 0.002 | 0.629 | | -0.062 | 1 | |
| Host zero | 0.017 | 0.292 | | 0.042 | 0.245 | | 0.120 | 0 | *** |
| Molysis | 0.125 | 0.033 | * | 0.084 | 0.088 | | 0.258 | 0 | *** |
| QIAamp | -0.012 | 0.896 | | 0.357 | 0.000 | *** | 0.687 | 0 | *** |
| log10(Final reads) | 0.414 | 0.091 | | 0.142 | 0.023 | * | 0.066 | 0 | *** |
| Residual | 0.422 | NA | | 0.405 | NA | | 0.014 | NA | |
| Total | 1.000 | NA | | 1.000 | NA | | 1.000 | NA | |
Beta diversity boxplot (Horn-Morisita)
Distances between samples within each subject. Mean distance between control <-> treatment for each subject
#distances of betadiversity - boxplots
# Builds, for every subject, the Horn distance between its untreated control and
# each treated replicate, then draws one boxplot per treatment and sample type.
# Sample metadata is recovered from the sample names (iso1 / iso2) by pattern
# matching; the three helpers below hold those rules in one place.

# Treatment encoded in a sample name; "control" when no treatment tag is found.
horn_label_treatment <- function(sample_name) {
  case_when(grepl("lyPMA", sample_name) ~ "lypma",
            grepl("ben", sample_name) ~ "benzonase",
            grepl("host", sample_name) ~ "host_zero",
            grepl("qia", sample_name) ~ "qiaamp",
            grepl("moly", sample_name) ~ "molysis",
            .default = "control")
}

# Subject-level id: the first two "_"-separated fields of the sample name.
# Ids matching the positive / negative control patterns collapse to "Mock" / "Neg.".
horn_label_subject <- function(sample_name) {
  fields <- str_split_fixed(sample_name, "_", 3)
  id <- paste(fields[, 1], fields[, 2], sep = "_")
  case_when(grepl("pos", id, ignore.case = TRUE) ~ "Mock",
            grepl("neg|n_", id, ignore.case = TRUE) ~ "Neg.",
            .default = id)
}

# Sample type encoded in a sample name; NA when no pattern matches.
horn_label_sample_type <- function(sample_name) {
  case_when(grepl("NS", sample_name) ~ "Nasal",
            grepl("CFB", sample_name) ~ "Sputum",
            grepl("BAL", sample_name) ~ "BAL",
            grepl("pos", sample_name, ignore.case = TRUE) ~ "Mock",
            grepl("neg|N_EXT", sample_name) ~ "Neg.",
            .default = NA_character_)
}

# Long table of pairwise distances; iso1 / iso2 / dist are the columns used below.
horn_dist_long <- distance(phyloseq_rel_nz, method="horn") %>% as.matrix() %>% melt_dist() #making long data of distance matrices

#Adding sample type and treatment name.
#this can be also done by merging metadata into the `horn_dist_long`
horn_dist_long$sample_id_1 <- horn_label_subject(horn_dist_long$iso1)
horn_dist_long$method_1 <- horn_label_treatment(horn_dist_long$iso1)
horn_dist_long$sample_id_2 <- horn_label_subject(horn_dist_long$iso2)
horn_dist_long$method_2 <- horn_label_treatment(horn_dist_long$iso2)

#subsetting distances of my interest
# Keep pairs within the same subject, between two different methods, where one
# side is the untreated control.
path_horn_dist_long_within_sampleid_from_control <- subset(
  horn_dist_long,
  sample_id_1 == sample_id_2 &
    method_1 != method_2 &
    (method_1 == "control" | method_2 == "control")
)
# Prints the intermediate table into the report (kept from the original script).
path_horn_dist_long_within_sampleid_from_control

# The treatment of a pair is whichever side is not the control.
path_horn_dist_long_within_sampleid_from_control$treatment <- ifelse(
  path_horn_dist_long_within_sampleid_from_control$method_1 == "control",
  path_horn_dist_long_within_sampleid_from_control$method_2,
  path_horn_dist_long_within_sampleid_from_control$method_1
)

#Setting key method
path_horn_dist_long_within_sampleid_from_control$sample_type <-
  horn_label_sample_type(path_horn_dist_long_within_sampleid_from_control$iso1)

#Making a column for baseline (controls, from where?)
path_horn_dist_long_within_sampleid_from_control <- path_horn_dist_long_within_sampleid_from_control %>%
  mutate(dist_from = case_when(method_1 == "control" ~ iso1,
                               method_2 == "control" ~ iso2))

# Zero-distance "Untreated" rows (control vs. itself), one per control sample,
# so the untreated group appears in the plot and models as a baseline.
dummy <- data.frame(iso1 = path_horn_dist_long_within_sampleid_from_control$dist_from %>% unique,
                    iso2 = path_horn_dist_long_within_sampleid_from_control$dist_from %>% unique,
                    dist = 0,
                    treatment = "Untreated",
                    method_1 = "control",
                    method_2 = "control"
)
dummy$sample_id_1 <- horn_label_subject(dummy$iso1)
dummy$sample_id_2 <- horn_label_subject(dummy$iso2)
dummy$sample_type <- horn_label_sample_type(dummy$iso1)
dummy <- subset(dummy, !is.na(dummy$sample_type))

path_horn_dist_long_within_sampleid_from_control <- bind_rows(path_horn_dist_long_within_sampleid_from_control, dummy)

path_horn_dist_long_within_sampleid_from_control %>%
  # Explicit factor() calls replace the deprecated across(col, factor, levels = ...).
  mutate(sample_type = factor(sample_type, levels = c("Mock", "BAL", "Nasal", "Sputum")),
         treatment = factor(treatment, levels = c("Untreated", "lypma", "benzonase", "host_zero", "molysis", "qiaamp"))) %>%
  ggplot(aes(y = dist, fill = treatment)) +
  geom_boxplot() +
  #scale_fill_manual(values = c(viridis(6)[2:6])) +
  # Legend label corrected from "QIAaamp" to "QIAamp".
  scale_fill_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"), name = "Treatment", labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  ylab("Sample pair distances (Horn-Morisita)") +
  theme_classic (base_size = 12, base_family = "serif") +
  theme(plot.tag = element_text(size = 15), axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  facet_wrap(~sample_type, ncol = 5)
LME on H-M distance
Effect size, standard error (SE) and p-value of a statistical test for Horn-Morisita distance from untreated to each treated sample within subject (H-M distance within subject by treatment ~ sample type + treatment + sample type * treatment + log10(Final_reads) + (1|subject_id)).
Hard to interpret, as it does not have proper baseline (control is not there as control does not have distances)
library(lmerTest)
# Fix the factor level order so "Untreated" and "Mock" are the reference levels
# of the model below.
path_horn_dist_long_within_sampleid_from_control$treatment <- factor(path_horn_dist_long_within_sampleid_from_control$treatment,
                                                                       levels = c("Untreated",
                                                                                  "lypma",
                                                                                  "benzonase",
                                                                                  "host_zero",
                                                                                  "molysis",
                                                                                  "qiaamp"))
path_horn_dist_long_within_sampleid_from_control$sample_type <- factor(path_horn_dist_long_within_sampleid_from_control$sample_type,
                                                                         levels = c("Mock",
                                                                                    "BAL",
                                                                                    "Nasal",
                                                                                    "Sputum"))

# Linear mixed model of within-subject distance-from-control on
# sample type x treatment, with a random intercept per subject id, rendered as
# an HTML coefficient table.
# Significance stars and confidence limits are computed from the UNROUNDED
# estimates and p-values. Rounding first would drop the star of, e.g.,
# p = 0.0496 (rounds to 0.050) and downgrade p = 0.0006 from *** to **.
tableS9 <- lmer(dist ~ sample_type * treatment + (1|sample_id_1),
                data = path_horn_dist_long_within_sampleid_from_control %>% subset(., .$sample_type != "Neg.")) %>%
  summary %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  rownames_to_column('row.names') %>%
  # Turn coefficient names into display labels.
  mutate(row.names = gsub("treatment|sample_type", "", row.names)) %>%
  mutate(row.names = gsub("[:]", " * ", row.names)) %>%
  mutate(row.names = gsub("host_zero", "Host zero", row.names)) %>%
  mutate(row.names = gsub("benzonase", "Benzonase", row.names)) %>%
  mutate(row.names = gsub("lypma", "lyPMA", row.names)) %>%
  mutate(row.names = gsub("molysis", "Molysis", row.names)) %>%
  mutate(row.names = gsub("qiaamp", "QIAamp", row.names)) %>%
  column_to_rownames("row.names") %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.001 ~ "***",
                         abs(`Pr(>|t|)`) < 0.01 ~ "**",
                         abs(`Pr(>|t|)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  rename("<i>p</i>-value" = "Pr(>|t|)",
         SE = "Std. Error") %>%
  # Estimate with a normal-approximation 95% interval (estimate +/- 1.96 SE).
  # trimws() removes the width padding format() adds, e.g. "( 0.00".
  mutate("Effect size (95% CI)" = paste0(trimws(format(round(Estimate, 2), nsmall = 2)),
                                         " (",
                                         trimws(format(round(Estimate - 1.96 * SE, 2), nsmall = 2)),
                                         ", ",
                                         trimws(format(round(Estimate + 1.96 * SE, 2), nsmall = 2)),
                                         ")"),
         "<i>p</i>-value" = round(`<i>p</i>-value`, 3)) %>%
  select(c("Effect size (95% CI)", "<i>p</i>-value", " ")) %>%
  # escape = FALSE keeps the HTML markup in the column headers.
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
tableS9
| | Effect size (95% CI) | p-value | |
|---|---|---|---|
| (Intercept) | 0.00 (-0.15, 0.15) | 1.000 | |
| BAL | 0.00 (-0.17, 0.17) | 1.000 | |
| Nasal | 0.00 (-0.15, 0.15) | 1.000 | |
| Sputum | 0.00 (-0.16, 0.16) | 1.000 | |
| lyPMA | 0.04 ( 0.00, 0.08) | 0.032 | * |
| Benzonase | 0.00 (-0.04, 0.04) | 0.914 | |
| Host zero | 0.00 (-0.04, 0.04) | 0.902 | |
| Molysis | 0.00 (-0.04, 0.04) | 0.913 | |
| QIAamp | 0.00 (-0.03, 0.04) | 0.891 | |
| BAL * lyPMA | 0.17 ( 0.09, 0.25) | 0.000 | *** |
| Nasal * lyPMA | -0.03 (-0.09, 0.03) | 0.329 | |
| Sputum * lyPMA | 0.08 ( 0.01, 0.15) | 0.020 | * |
| BAL * Benzonase | 0.04 (-0.04, 0.11) | 0.391 | |
| Nasal * Benzonase | 0.01 (-0.05, 0.08) | 0.649 | |
| Sputum * Benzonase | 0.18 ( 0.11, 0.24) | 0.000 | *** |
| BAL * Host zero | 0.18 ( 0.10, 0.26) | 0.000 | *** |
| Nasal * Host zero | 0.05 (-0.01, 0.11) | 0.131 | |
| Sputum * Host zero | 0.26 ( 0.19, 0.32) | 0.000 | *** |
| BAL * Molysis | 0.18 ( 0.10, 0.26) | 0.000 | *** |
| Nasal * Molysis | 0.05 (-0.01, 0.11) | 0.135 | |
| Sputum * Molysis | 0.26 ( 0.19, 0.33) | 0.000 | *** |
| BAL * QIAamp | 0.10 ( 0.02, 0.18) | 0.015 | * |
| Nasal * QIAamp | 0.02 (-0.04, 0.08) | 0.538 | |
| Sputum * QIAamp | 0.25 ( 0.18, 0.32) | 0.000 | *** |
LME on H-M distance (stratified)
# Fix the factor level order so "Untreated" is the reference treatment.
path_horn_dist_long_within_sampleid_from_control$treatment <- factor(path_horn_dist_long_within_sampleid_from_control$treatment,
                                                                       levels = c("Untreated",
                                                                                  "lypma",
                                                                                  "benzonase",
                                                                                  "host_zero",
                                                                                  "molysis",
                                                                                  "qiaamp"))
# Only the three respiratory sample types are kept as levels; any other value
# (e.g. "Mock") becomes NA in this column from here on.
path_horn_dist_long_within_sampleid_from_control$sample_type <- factor(path_horn_dist_long_within_sampleid_from_control$sample_type,
                                                                         levels = c("BAL",
                                                                                    "Nasal",
                                                                                    "Sputum"))

# Fit dist ~ treatment by ordinary least squares (lm) within one sample type
# and format the coefficient table for display.
#
# dist_data: data frame with columns dist, treatment and sample_type.
# type:      the sample_type value to keep ("BAL", "Nasal" or "Sputum").
# Returns a data frame with one row per coefficient and three columns:
# "Effect size (95% CI)", p-value (rounded to three decimals) and stars.
#
# Stars and confidence limits are computed from the UNROUNDED estimates and
# p-values. Rounding first would drop the star of, e.g., p = 0.0496 (rounds to
# 0.050) and downgrade p = 0.0006 from *** to **.
summarise_dist_by_treatment <- function(dist_data, type) {
  # which() drops rows whose sample_type is NA, as subset() did.
  rows_of_type <- dist_data[which(dist_data$sample_type == type), ]
  lm(dist ~ treatment, data = rows_of_type) %>%
    summary %>%
    .$coefficients %>%
    data.frame(check.names = FALSE) %>%
    rownames_to_column('row.names') %>%
    # Turn coefficient names into display labels.
    mutate(row.names = gsub("host_zero", "Host zero", row.names)) %>%
    mutate(row.names = gsub("benzonase", "Benzonase", row.names)) %>%
    mutate(row.names = gsub("lypma", "lyPMA", row.names)) %>%
    mutate(row.names = gsub("molysis", "Molysis", row.names)) %>%
    mutate(row.names = gsub("qiaamp", "QIAamp", row.names)) %>%
    mutate(`row.names` = gsub("treatment|sample_type", "", `row.names`)) %>%
    column_to_rownames("row.names") %>%
    mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.001 ~ "***",
                           abs(`Pr(>|t|)`) < 0.01 ~ "**",
                           abs(`Pr(>|t|)`) < 0.05 ~ "*",
                           .default = " ")) %>%
    rename("<i>p</i>-value" = "Pr(>|t|)",
           SE = "Std. Error") %>%
    # Estimate with a normal-approximation 95% interval (estimate +/- 1.96 SE).
    # trimws() removes the width padding format() adds, e.g. "( 0.02".
    mutate("Effect size (95% CI)" = paste0(trimws(format(round(Estimate, 2), nsmall = 2)),
                                           " (",
                                           trimws(format(round(Estimate - 1.96 * SE, 2), nsmall = 2)),
                                           ", ",
                                           trimws(format(round(Estimate + 1.96 * SE, 2), nsmall = 2)),
                                           ")"),
           "<i>p</i>-value" = round(`<i>p</i>-value`, 3)) %>%
    select(c("Effect size (95% CI)", "<i>p</i>-value", " "))
}

# One table per sample type. The three models share the same coefficients, so
# their rows line up when bound side by side.
a <- summarise_dist_by_treatment(path_horn_dist_long_within_sampleid_from_control, "BAL")
b <- summarise_dist_by_treatment(path_horn_dist_long_within_sampleid_from_control, "Nasal")
c <- summarise_dist_by_treatment(path_horn_dist_long_within_sampleid_from_control, "Sputum")

cbind(a, b, c) %>%
  # escape = FALSE keeps the HTML markup in the column headers.
  kbl(format = "html", escape = FALSE) %>%
  add_header_above(c(" " = 1, "BAL" = 3, "Nasal swab" = 3,"Sputum"= 3)) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| | Effect size (95% CI) | p-value | | Effect size (95% CI) | p-value | | Effect size (95% CI) | p-value | |
|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.00 (-0.14, 0.14) | 1.000 | | 0.00 (-0.03, 0.03) | 1.000 | | 0.00 (-0.12, 0.12) | 1.000 | |
| lyPMA | 0.21 ( 0.02, 0.41) | 0.055 | | 0.01 (-0.04, 0.05) | 0.776 | | 0.12 (-0.05, 0.29) | 0.182 | |
| Benzonase | 0.04 (-0.16, 0.23) | 0.717 | | 0.02 (-0.02, 0.07) | 0.317 | | 0.18 ( 0.01, 0.35) | 0.055 | |
| Host zero | 0.18 (-0.01, 0.38) | 0.092 | | 0.06 ( 0.01, 0.10) | 0.017 | * | 0.26 ( 0.09, 0.43) | 0.007 | ** |
| Molysis | 0.18 (-0.01, 0.38) | 0.088 | | 0.05 ( 0.01, 0.10) | 0.024 | * | 0.26 ( 0.09, 0.43) | 0.007 | ** |
| QIAamp | 0.10 (-0.09, 0.30) | 0.327 | | 0.02 (-0.03, 0.06) | 0.488 | | 0.25 ( 0.08, 0.43) | 0.008 | ** |
A6. DA for taxa
DA analysis was conducted for taxa
DA taxa
Factors affecting DA taxa (main text)
# Load the precomputed differential-abundance result tables from data/
# (presumably MaAsLin2 outputs, judging by the file names - confirm).
# Overall fits:
filt_maaslin_all <- read.csv("data/filt_maaslin_all.csv")
filt_maaslin_interaction <- read.csv("data/filt_maaslin_interaction.csv")
# Per-sample-type fits (BAL, sputum, nasal, positive control):
filt_fit_data_bal <- read.csv("data/filt_fit_data_bal.csv")
filt_fit_data_spt <- read.csv("data/filt_fit_data_spt.csv")
filt_fit_data_ns <- read.csv("data/filt_fit_data_ns.csv")
filt_fit_data_pos <- read.csv("data/filt_fit_data_pos.csv")

cat("Factors affecting DA taxa (q<0.1)")
## Factors affecting DA taxa (q<0.1)
# Number of associations with q < 0.1, tallied per covariate.
subset(filt_maaslin_all, qval < 0.1)$metadata %>% table
## .
## benzonase host_zero log10.Final_reads lypma
## 34 19 85 31
## molysis qiaamp sample_type
## 31 22 118
Volcano plot
Volcano plot of sequencing depth adjusted differential abundance of taxa by each treatment
#Making significance table for figure
# Define a function to make species names italicized
# Make a significance table for each figure (top 20 taxa)

# Rewrite the row names of `data` as markdown-italic taxon labels.
#
# data: a data frame or matrix whose row names are taxon names with "_" as the
#       word separator (square brackets are stripped).
# Returns `data` unchanged except for its row names:
#   "Genus_species"  -> "*Genus species*"
#   "Genus_sp_xyz"   -> "*Genus* sp. xyz"   (only the part before "sp." is italic)
#   "Genus_x_group"  -> "*Genus x* group."
#
# "sp" and "group" are matched as whole words only. The previous unanchored
# patterns also hit species epithets that merely start with "sp"
# (e.g. "Sphingobacterium spiritivorum" became "...* sp.iritivorum"), and a name
# already written "sp." received a second period.
species_italic <- function(data) {
  labels <- gsub("_", " ", rownames(data))
  labels <- gsub("[]]|[[]", "", labels)
  # Whole-word "sp", with or without a trailing period: close the italic span
  # before it and normalise to "sp.".
  labels <- gsub(" sp\\b\\.?", "* sp.", labels, perl = TRUE)
  labels <- gsub(" group\\b", "* group.", labels, perl = TRUE)
  # Names that already contain a closing "*" only need the opening one;
  # every other name is wrapped completely.
  labels <- ifelse(grepl("[*]", labels),
                   paste0("*", labels),
                   paste0("*", labels, "*"))
  rownames(data) <- labels
  data
}
# Build the q-value tables for the 20 most significant taxa of one fit.
#
# data: long-format result table with (at least) the columns `feature`,
#       `metadata` (covariate name) and `qval`; the covariates must include
#       lypma, benzonase, host_zero, molysis and qiaamp.
# Returns a list with:
#   data        - wide table (feature + one q-value column per treatment),
#                 at most 20 rows, ordered by each feature's smallest q-value
#   data_italic - same q-values with markdown-italic taxon labels as row names
#                 and display names for the treatment columns
#   data_sig    - "*" where q < 0.1, NA otherwise (same shape as data_italic)
make_sig_table <- function(data) {
  sig_data <- spread(data[order(data$qval), c("feature", "metadata", "qval")], metadata, qval)
  # Smallest q-value per feature across all covariate columns, computed on the
  # numeric columns only. apply() over the whole data frame coerced every row
  # to character (because of `feature`), so the minimum and the ordering below
  # were lexicographic string comparisons. Missing q-values are ignored.
  qval_cols <- setdiff(names(sig_data), "feature")
  sig_data$min <- do.call(pmin, c(unname(as.list(sig_data[qval_cols])), na.rm = TRUE))
  # head() keeps at most 20 rows; `[1:20, ]` padded short tables with NA rows.
  sig_data <- sig_data[order(sig_data$min), ] %>%
    select("feature", "lypma", "benzonase", "host_zero", "molysis", "qiaamp") %>%
    head(20)
  # Restore the one taxon name whose brackets were mangled into "X." / "." in
  # the input table.
  sig_data[["feature"]] <- ifelse(sig_data[["feature"]] == "X.Collinsella._massiliensis", "[Collinsella]_massiliensis", sig_data[["feature"]])
  # The existing row names are parked in a throw-away column ("-") so that
  # `feature` can take their place.
  sig_data_italic <- sig_data %>%
    rownames_to_column(var = "-") %>%
    column_to_rownames(var = "feature") %>%
    species_italic %>%
    select(-c("-")) %>%
    rename(lyPMA = lypma, Benzonase = benzonase, `Host zero` = host_zero, Molysis = molysis, QIAamp = qiaamp)
  sig_data_sig <- ifelse(sig_data_italic < 0.1, "*", NA) %>% data.frame(check.names = FALSE)
  list(data = sig_data, data_italic = sig_data_italic, data_sig = sig_data_sig)
}
# Replace each raw per-sample-type result table with the list returned by
# make_sig_table() (elements: data, data_italic, data_sig).
filt_fit_data_pos <- make_sig_table(filt_fit_data_pos)
filt_fit_data_bal <- make_sig_table(filt_fit_data_bal)
filt_fit_data_ns <- make_sig_table(filt_fit_data_ns)
filt_fit_data_spt <- make_sig_table(filt_fit_data_spt)
# For each sample type, add a `$rel` element: mean relative abundance per
# treatment of the selected taxa.
#  1. subset the relative-abundance phyloseq object to the sample type and to
#     taxa whose names appear in `$data$feature`;
#  2. bind the transposed OTU table to the sample metadata, average every
#     numeric column per treatment, and keep columns 1:21 (the grouping column
#     plus the first 20 numeric columns);
#     NOTE(review): this positional cut assumes exactly 20 taxa matched - with
#     fewer, numeric metadata columns slip in (they are dropped again by the row
#     lookup in step 3) - TODO confirm;
#  3. transpose to taxa x treatment, italicise the taxon labels, reorder the
#     rows to match `$data_italic`, turn zeros into NA, and move the labels
#     into a `feature` column.
# Mock community (positive control)
filt_pos_sig <- subset_taxa(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "Mock"),
taxa_names(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "Mock")) %in% filt_fit_data_pos$data$feature)
filt_fit_data_pos$rel <- cbind(filt_pos_sig %>% otu_table %>% t, filt_pos_sig %>% sample_data) %>% group_by(treatment) %>% summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>% column_to_rownames(., "treatment") %>% t () %>% species_italic() %>% data.frame(check.names = F) %>%
.[row.names(filt_fit_data_pos$data_italic),] %>% mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")
# Sputum
filt_spt_sig <- subset_taxa(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "Sputum"), taxa_names(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "Sputum")) %in% filt_fit_data_spt$data$feature)
filt_fit_data_spt$rel <- cbind(filt_spt_sig %>% otu_table %>% t, filt_spt_sig %>% sample_data) %>% group_by(treatment) %>% summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>% column_to_rownames(., "treatment") %>% t () %>% species_italic() %>% data.frame(check.names = F) %>%
.[row.names(filt_fit_data_spt$data_italic),] %>% mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")
# Nasal swab
filt_ns_sig <- subset_taxa(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "Nasal"),
taxa_names(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "Nasal")) %in% filt_fit_data_ns$data$feature)
filt_fit_data_ns$rel <- cbind(filt_ns_sig %>% otu_table %>% t, filt_ns_sig %>% sample_data) %>% group_by(treatment) %>% summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>% column_to_rownames(., "treatment") %>% t () %>% species_italic() %>% data.frame(check.names = F) %>%
.[row.names(filt_fit_data_ns$data_italic),] %>% mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")
# BAL - additionally drops rows whose feature label contains "NA" (rows
# requested in step 3 that have no match in the abundance table).
filt_bal_sig <- subset_taxa(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "BAL"),
taxa_names(subset_samples(phyloseq_unfiltered$phyloseq_rel, sample_type == "BAL")) %in% filt_fit_data_bal$data$feature)
filt_fit_data_bal$rel <- cbind(filt_bal_sig %>% otu_table %>% t, filt_bal_sig %>% sample_data) %>% group_by(treatment) %>% summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>% column_to_rownames(., "treatment") %>% t () %>% species_italic() %>% data.frame(check.names = F) %>%
.[row.names(filt_fit_data_bal$data_italic),] %>%
mutate_all(~na_if(., 0)) %>% rownames_to_column("feature") %>% subset(., !grepl("NA", .$feature))
#Volcano plot
# One point per feature-covariate association: model coefficient (x) against
# -log10(q-value) (y), coloured by covariate. The grey horizontal line marks
# q = 0.1 (-log10 = 1) and the grey vertical line marks a coefficient of 0.
ggplot(filt_maaslin_all, aes(y = -log10(qval), x = coef, col = metadata)) +
  theme_classic(base_family = "serif") +
  #labs(tag = "A") +
  geom_point(size = 2) +
  xlab("MaAslin coefficient") +
  ylab("-log<sub>10</sub>(*q*-value)") +
  # NOTE: ylim() removes (does not clip) anything outside -1..35, including
  # points with -log10(q) > 35.
  ylim(c(-1, 35)) +
  geom_hline(yintercept = 1, col = "gray") +
  geom_vline(xintercept = 0, col = "gray") +
  # The label explaining the grey guide lines used to sit at y = 80, outside
  # the y limits above, so it was silently dropped from the figure. It is now
  # placed inside the visible range.
  annotate(family = "serif",
           geom = 'richtext',
           x = 0, y = 33,
           label = "<i>q</i>-value = 0.1, fold-change = 0") +
  theme(legend.position = "top", axis.title.y = ggtext::element_markdown(), legend.text = element_markdown()) +
  scale_color_manual(values = c("#4daf4a", "#984ea3", "#f781bf", "#377eb8", "#ff7f00", "#ffff33", "#a65628"),
                     breaks = c("log10.Final_reads", "sample_type", "lypma", "benzonase", "host_zero", "molysis", "qiaamp"),
                     labels = c("log<sub>10</sub>(Final reads)", "Sample type", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  guides(col = guide_legend(title = "Covariates", title.position = "top", nrow = 1))
Balloon plot
Mean relative abundance of top 20 significant taxa identified by differential abundance analysis by sample type. (A) Mock community, (B) bronchoalveolar lavage, (C) nasal swabs, and (D) sputum.
# Panel A (Mock community): balloon plot of the top taxa.
# Balloon size = mean relative abundance per treatment; fill = whether the
# taxon-treatment association has q < 0.1.
f5a <- merge(filt_fit_data_pos$rel %>%
               gather(treatment,
                      value,
                      Untreated:QIAamp,
                      factor_key = TRUE),
             filt_fit_data_pos$data_italic %>%
               rownames_to_column("feature") %>%
               gather(treatment,
                      qval,
                      lyPMA:QIAamp,
                      factor_key = TRUE),
             by.x = c('feature', 'treatment'),
             by.y = c('feature', 'treatment'),
             all = TRUE) %>%
  merge(filt_fit_data_pos$data_sig %>%
          rownames_to_column("feature") %>%
          gather(treatment,
                 sig,
                 lyPMA:QIAamp,
                 factor_key = TRUE),
        by.x = c('feature', 'treatment'),
        by.y = c('feature', 'treatment'),
        all = TRUE) %>%
  # Significance class from the numeric q-value. The previous `sig < 0.1`
  # compared the "*" / NA marker strings with a number and gave the intended
  # result only through string collation. Rows without a q-value (e.g. the
  # untreated group) fall into "> 0.1".
  mutate(sig = case_when(qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1"),
         feature = case_when(feature =="*Saccharomyces cerevisiae x Saccharomyces kudriavzevii*" ~ "*S. cerevisiae* X *S. kudriavzevii*",
                             .default = feature)) %>%
  #Baloon plot
  ggballoonplot(size = "value", y = "feature", x= "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Species") +
  labs(tag = "A") +
  theme(panel.grid.major = element_line(colour = "grey"),
        legend.position = "top",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        #Element markdown for taxa name italicizing
        axis.text.y = ggtext::element_markdown(size = 7)) +
  # Named values pin each class to its colour even when only one class occurs.
  # (Removed: a gradient_fill() scale that this scale replaced anyway, and a
  # stray aes() argument that the scale does not use.)
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(fill = guide_legend(title = c(expression(paste(italic("q"),
                                                        "-value",
                                                        sep = ""))),
                             title.position = "top"),
         size = guide_legend(title = "Relative abundance",
                             title.position = "top",
                             order = 1,
                             nrow = 1)
  ) +
  scale_x_discrete(labels=c("control" = "Untreated",
                            "lypma" = "lyPMA",
                            "benzonase" = "Benzonase",
                            "host_zero" = "Host-zero",
                            "molysis" = "Molysis",
                            "qiaamp" = "QIAamp")
  )
#ffff33 qia
# Panel B (BAL): balloon plot of the top taxa.
# Balloon size = mean relative abundance per treatment; fill = whether the
# taxon-treatment association has q < 0.1.
f5b <- merge(filt_fit_data_bal$rel %>%
               gather(treatment,
                      value,
                      Untreated:QIAamp,
                      factor_key = TRUE),
             filt_fit_data_bal$data_italic %>%
               rownames_to_column("feature") %>%
               gather(treatment,
                      qval,
                      lyPMA:QIAamp,
                      factor_key = TRUE),
             by.x = c('feature', 'treatment'),
             by.y = c('feature', 'treatment'),
             all = TRUE) %>%
  merge(filt_fit_data_bal$data_sig %>%
          rownames_to_column("feature") %>%
          gather(treatment,
                 sig,
                 lyPMA:QIAamp,
                 factor_key = TRUE),
        by.x = c('feature', 'treatment'),
        by.y = c('feature', 'treatment'),
        all = TRUE) %>%
  # Significance class from the numeric q-value; rows without a q-value
  # (e.g. the untreated group) fall into "> 0.1".
  mutate(sig = case_when(is.na(qval) ~ "> 0.1",
                         qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1")) %>%
  #Baloon plot
  ggballoonplot(size = "value", y = "feature", x= "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Species") +
  labs(tag = "B") +
  theme(panel.grid.major = element_line(colour = "grey"),
        legend.position = "top",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        #Element markdown for taxa name italicizing
        axis.text.y = ggtext::element_markdown(size = 7)) +
  # Named values pin each class to its colour. The previous unnamed
  # c("grey", "red") was only right while "> 0.1" was the sole class present;
  # with both classes it would have drawn significant taxa in grey, the
  # opposite of the other panels. (Also removed: a stray aes() argument that
  # the scale does not use.)
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(fill = guide_legend(title = c(expression(paste(italic("q"),
                                                        "-value",
                                                        sep = ""))),
                             title.position = "top"),
         size = guide_legend(title = "Relative abundance",
                             title.position = "top",
                             order = 1,
                             nrow = 1)
  ) +
  scale_x_discrete(labels=c("control" = "Untreated",
                            "lypma" = "lyPMA",
                            "benzonase" = "Benzonase",
                            "host_zero" = "Host-zero",
                            "molysis" = "Molysis",
                            "qiaamp" = "QIAamp")
  )
# Panel C (nasal swabs): balloon plot of the top taxa.
# Balloon size = mean relative abundance per treatment; fill = whether the
# taxon-treatment association has q < 0.1.
f5c <- merge(filt_fit_data_ns$rel %>%
               gather(treatment,
                      value,
                      Untreated:QIAamp,
                      factor_key = TRUE),
             filt_fit_data_ns$data_italic %>%
               rownames_to_column("feature") %>%
               gather(treatment,
                      qval,
                      lyPMA:QIAamp,
                      factor_key = TRUE),
             by.x = c('feature', 'treatment'),
             by.y = c('feature', 'treatment'),
             all = TRUE) %>%
  merge(filt_fit_data_ns$data_sig %>%
          rownames_to_column("feature") %>%
          gather(treatment,
                 sig,
                 lyPMA:QIAamp,
                 factor_key = TRUE),
        by.x = c('feature', 'treatment'),
        by.y = c('feature', 'treatment'),
        all = TRUE) %>%
  # Significance class from the numeric q-value. The previous `sig < 0.1`
  # compared the "*" / NA marker strings with a number and gave the intended
  # result only through string collation. Rows without a q-value (e.g. the
  # untreated group) fall into "> 0.1".
  mutate(sig = case_when(qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1")) %>%
  #Baloon plot
  ggballoonplot(size = "value", y = "feature", x= "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Species") +
  labs(tag = "C") +
  theme(panel.grid.major = element_line(colour = "grey"),
        legend.position = "top",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        #Element markdown for taxa name italicizing
        axis.text.y = ggtext::element_markdown(size = 7)) +
  # Named values pin each class to its colour even when only one class occurs.
  # (Removed: a gradient_fill() scale that this scale replaced anyway, and a
  # stray aes() argument that the scale does not use.)
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(fill = guide_legend(title = c(expression(paste(italic("q"),
                                                        "-value",
                                                        sep = ""))),
                             title.position = "top"),
         size = guide_legend(title = "Relative abundance",
                             title.position = "top",
                             order = 1,
                             nrow = 1)
  ) +
  scale_x_discrete(labels=c("control" = "Untreated",
                            "lypma" = "lyPMA",
                            "benzonase" = "Benzonase",
                            "host_zero" = "Host-zero",
                            "molysis" = "Molysis",
                            "qiaamp" = "QIAamp")
  )
# Panel D (sputum): balloon plot of the features in filt_fit_data_spt.
# Balloon size is the per-treatment mean relative abundance; fill marks whether
# the MaAsLin q-value of that feature/treatment pair is below 0.1.
f5d <- merge(
  # Abundances in long format: one row per feature x treatment.
  filt_fit_data_spt$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  # q-values in long format. Only lyPMA:QIAamp are gathered, so pairs without
  # a q-value leave the outer merge with qval = NA.
  filt_fit_data_spt$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by.x = c("feature", "treatment"),
  by.y = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    filt_fit_data_spt$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by.x = c("feature", "treatment"),
    by.y = c("feature", "treatment"),
    all = TRUE
  ) %>%
  # Classify from the numeric q-value. `sig` from data_sig is a significance
  # marker rather than a q-value (presumably "*"/NA -- confirm), so comparing
  # it with 0.1 is not a numeric test. A missing q-value falls to "> 0.1".
  mutate(sig = case_when(qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1")) %>%
  # Balloon plot
  ggballoonplot(size = "value", y = "feature", x = "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Species") +
  labs(tag = "D") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    # element_markdown renders markdown/HTML in the feature labels
    axis.text.y = ggtext::element_markdown(size = 7),
    axis.title.x = element_text(margin = margin(t = 20))
  ) +
  # Named values pin each colour to its level, so a panel that happens to
  # contain only one of the two levels still gets the right colour.
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(
    fill = guide_legend(
      title = expression(paste(italic("q"), "-value", sep = "")),
      title.position = "top"
    ),
    size = guide_legend(
      title = "Relative abundance",
      title.position = "top",
      order = 1,
      nrow = 1
    )
  ) +
  # NOTE(review): the gathered treatment levels are column names such as
  # Untreated/lyPMA/QIAamp, which do not match these lower-case keys, so this
  # mapping looks like a no-op -- confirm before relying on it.
  scale_x_discrete(labels = c("control" = "Untreated",
                              "lypma" = "lyPMA",
                              "benzonase" = "Benzonase",
                              "host_zero" = "Host-zero",
                              "molysis" = "Molysis",
                              "qiaamp" = "QIAamp"))
# Arrange the four balloon panels on one page, aligned on both axes and
# sharing a single legend.
ggarrange(
  plotlist = list(f5a, f5b, f5c, f5d),
  align = "hv",
  common.legend = TRUE
)
A7. LM of function alpha diversity
Function richness - LMER all samples
The effects of some treatments were sample-type specific.
Effect size with 95% confidence interval (estimate ± 1.96 × standard error, SE) and p-value from a linear mixed-effects model of function richness with an interaction term (function richness ~ sample type * treatment + centered log(final reads + 1) + (1|subject_id)).
# Sample metadata of the pathway (RPK) phyloseq object. The variable reuses the
# name of phyloseq::sample_data(); later chunks read it under this name.
sample_data <- sample_data(phyloseq$phyloseq_path_rpk)

# Natural-log read depth (+1), centred on the median of the untreated BAL
# samples.
sample_data$log_centered_final_reads <-
  log(sample_data$Final_reads + 1) -
  median(log(
    sample_data$Final_reads[
      sample_data$sample_type %in% c("BAL") &
        sample_data$treatment %in% c("Untreated")
    ] + 1
  ))

# Mixed model of function richness with a sample type x treatment interaction
# and a random intercept per subject, rendered as an HTML table of effect
# sizes (Wald 95% CI), p-values and significance stars.
lmer(
  S.obs ~ sample_type * treatment + log_centered_final_reads + (1 | subject_id),
  data = subset(
    data.frame(sample_data),
    sample_type %in% c("BAL", "Nasal", "Sputum")
  )
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.001 ~ "***",
                         abs(`Pr(>|t|)`) < 0.01 ~ "**",
                         abs(`Pr(>|t|)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  # Turn coefficient names into readable labels: drop the variable-name
  # prefixes and show interactions as "A * B".
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  rename("<i>p</i>-value" = "Pr(>|t|)",
         SE = "Std. Error") %>%
  mutate(
    "Effect size (95% CI)" = paste0(
      format(round(Estimate, 1), nsmall = 1),
      " (",
      format(round(Estimate - 1.96 * SE, 1), nsmall = 1),
      ", ",
      format(round(Estimate + 1.96 * SE, 1), nsmall = 1),
      ")"
    ),
    "<i>p</i>-value" = round(`<i>p</i>-value`, 3)
  ) %>%
  select(c("Effect size (95% CI)", "<i>p</i>-value", " ")) %>%
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
| Effect size (95% CI) | p-value | ||
|---|---|---|---|
| (Intercept) | 25.5 ( -14.7, 65.7) | 0.221 | |
| Nasal | 54.7 ( 1.2, 108.1) | 0.050 | |
| Sputum | 129.0 ( 71.8, 186.3) | 0.000 | *** |
| lyPMA | 37.0 ( -5.6, 79.6) | 0.094 | |
| Benzonase | 78.2 ( 33.1, 123.3) | 0.001 | ** |
| Host zero | 109.1 ( 62.9, 155.3) | 0.000 | *** |
| Molysis | 127.6 ( 80.6, 174.5) | 0.000 | *** |
| QIAamp | 65.4 ( 18.3, 112.5) | 0.008 | ** |
| log_centered_final_reads | 27.8 ( 19.1, 36.6) | 0.000 | *** |
| Nasal * lyPMA | 14.9 ( -44.8, 74.6) | 0.626 | |
| Sputum * lyPMA | 9.4 ( -50.2, 68.9) | 0.759 | |
| Nasal * Benzonase | -73.5 (-132.0, -15.0) | 0.017 | * |
| Sputum * Benzonase | -47.6 (-107.1, 11.8) | 0.121 | |
| Nasal * Host zero | -107.0 (-164.1, -50.0) | 0.000 | *** |
| Sputum * Host zero | -78.8 (-140.0, -17.6) | 0.014 | * |
| Nasal * Molysis | -95.9 (-155.2, -36.7) | 0.002 | ** |
| Sputum * Molysis | -112.0 (-174.5, -49.5) | 0.001 | *** |
| Nasal * QIAamp | -58.0 (-115.1, -1.0) | 0.050 | |
| Sputum * QIAamp | -40.6 (-100.5, 19.3) | 0.189 |
Function richness - stratified
Stratified analysis will be conducted.
# Stratified model, BAL samples only: function richness ~ treatment +
# log10(final reads) with a random intercept per subject. The result is a
# formatted coefficient table (effect size, SE, p-value, stars).
sr_lmer_bal <- lmer(
  S.obs ~ treatment + log10(Final_reads) + (1 | subject_id),
  data = subset(data.frame(sample_data), sample_type %in% c("BAL"))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.001 ~ "***",
                         abs(`Pr(>|t|)`) < 0.01 ~ "**",
                         abs(`Pr(>|t|)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  # Readable row labels: strip variable prefixes, show interactions as
  # "A * B", subscript the 10 of log10 and replace underscores by spaces.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  mutate(x = gsub("_", " ", gsub("10", "<sub>10</sub>", x))) %>%
  column_to_rownames(var = "x") %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 2))) %>%
  rename("<i>p</i>-value" = "Pr(>|t|)",
         SE = "Std. Error",
         "Effect size" = "Estimate") %>%
  select(c("Effect size", "SE", "<i>p</i>-value", " "))
# Stratified model, nasal swabs only: function richness ~ treatment +
# log10(final reads) with a random intercept per subject. The result is a
# formatted coefficient table (effect size, SE, p-value, stars).
# The subset is "Nasal"; fitting this table on "BAL" would simply repeat the
# BAL column group in the combined table.
sr_lmer_ns <- lmer(
  S.obs ~ treatment + log10(Final_reads) + (1 | subject_id),
  data = subset(data.frame(sample_data), sample_type %in% c("Nasal"))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.001 ~ "***",
                         abs(`Pr(>|t|)`) < 0.01 ~ "**",
                         abs(`Pr(>|t|)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  # Readable row labels: strip variable prefixes, show interactions as
  # "A * B", subscript the 10 of log10 and replace underscores by spaces.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub("treatment|sample_type", "", x)) %>%
  mutate(x = gsub(":", " * ", x)) %>%
  mutate(x = gsub("10", "<sub>10</sub>", x)) %>%
  mutate(x = gsub("_", " ", x)) %>%
  column_to_rownames(var = "x") %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 2))) %>%
  rename("<i>p</i>-value" = "Pr(>|t|)",
         SE = "Std. Error",
         "Effect size" = "Estimate") %>%
  select(c("Effect size", "SE", "<i>p</i>-value", " "))
# Stratified model, sputum only: function richness ~ treatment +
# log10(final reads) with a random intercept per subject. The result is a
# formatted coefficient table (effect size, SE, p-value, stars).
# The subset is "Sputum"; fitting this table on "BAL" would simply repeat the
# BAL column group in the combined table.
sr_lmer_spt <- lmer(
  S.obs ~ treatment + log10(Final_reads) + (1 | subject_id),
  data = subset(data.frame(sample_data), sample_type %in% c("Sputum"))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.001 ~ "***",
                         abs(`Pr(>|t|)`) < 0.01 ~ "**",
                         abs(`Pr(>|t|)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  # Readable row labels: strip variable prefixes, show interactions as
  # "A * B", subscript the 10 of log10 and replace underscores by spaces.
  rownames_to_column(var = "x") %>%
  mutate(x = gsub("treatment|sample_type", "", x)) %>%
  mutate(x = gsub(":", " * ", x)) %>%
  mutate(x = gsub("10", "<sub>10</sub>", x)) %>%
  mutate(x = gsub("_", " ", x)) %>%
  column_to_rownames(var = "x") %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 2))) %>%
  rename("<i>p</i>-value" = "Pr(>|t|)",
         SE = "Std. Error",
         "Effect size" = "Estimate") %>%
  select(c("Effect size", "SE", "<i>p</i>-value", " "))
# Put the three stratified coefficient tables side by side and label each
# group of four columns with its sample type.
kable_styling(
  add_header_above(
    kbl(
      cbind(sr_lmer_bal, sr_lmer_ns, sr_lmer_spt),
      format = "html",
      escape = FALSE
    ),
    c(" " = 1, "BAL" = 4, "Nasal swab" = 4, "Sputum" = 4)
  ),
  full_width = FALSE,
  html_font = "serif"
)
| Effect size | SE | p-value | Effect size | SE | p-value | Effect size | SE | p-value | ||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | -594.95 | 96.84 | 0.00 | *** | -594.95 | 96.84 | 0.00 | *** | -594.95 | 96.84 | 0.00 | *** |
| lyPMA | 19.95 | 25.60 | 0.45 | 19.95 | 25.60 | 0.45 | 19.95 | 25.60 | 0.45 | |||
| Benzonase | 38.89 | 28.53 | 0.19 | 38.89 | 28.53 | 0.19 | 38.89 | 28.53 | 0.19 | |||
| Host zero | 63.01 | 29.78 | 0.05 | * | 63.01 | 29.78 | 0.05 | * | 63.01 | 29.78 | 0.05 | * |
| Molysis | 77.37 | 30.62 | 0.02 | * | 77.37 | 30.62 | 0.02 | * | 77.37 | 30.62 | 0.02 | * |
| QIAamp | 14.58 | 30.74 | 0.64 | 14.58 | 30.74 | 0.64 | 14.58 | 30.74 | 0.64 | |||
| log10(Final reads) | 112.60 | 17.23 | 0.00 | *** | 112.60 | 17.23 | 0.00 | *** | 112.60 | 17.23 | 0.00 | *** |
A8. PERMANOVA of function beta diversity
Function beta - all samples (Table S11)
Table S11. Degrees of freedom, effect size (R^2, including the residual) and p-value of permutational ANOVA (PERMANOVA) for functional Horn-Morisita distances with an interaction term and a strata term (Horn-Morisita distance of functions ~ sample type * treatment + subject id + log10(final reads), strata = subject id).
# Convert pathway RPK counts to per-sample proportions, then keep samples of
# the four analysed sample types whose observed richness (S.obs) is non-zero.
phyloseq_rel_nz <- subset_samples(
  transform_sample_counts(
    phyloseq$phyloseq_path_rpk,
    function(rpk) rpk / sum(rpk)
  ),
  S.obs != 0 & sample_type %in% c("Mock", "BAL", "Nasal", "Sputum")
)
# PERMANOVA on Horn distances of all retained samples, with a
# sample type x treatment interaction, subject and log10 read depth as terms;
# permutations are restricted within subject (strata).
bray_perm_inter <- local({
  meta_all <- data.frame(sample_data(phyloseq_rel_nz), check.names = FALSE)
  vegan::adonis2(
    distance(phyloseq_rel_nz, method = "horn") ~
      sample_type * treatment + subject_id + log10(Final_reads),
    data = meta_all,
    strata = meta_all$subject_id,
    permutations = 10000
  )
})
# Stratified PERMANOVA, nasal swabs only: Horn distances against the
# per-method indicator columns and log10 read depth, permuting within subject.
bray_perm_ns <- local({
  ps_nasal <- subset_samples(phyloseq_rel_nz, sample_type == "Nasal")
  meta_nasal <- data.frame(sample_data(ps_nasal), check.names = FALSE)
  vegan::adonis2(
    distance(ps_nasal, method = "horn") ~
      lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
    data = meta_nasal,
    strata = meta_nasal$subject_id,
    permutations = 10000
  )
})
# Stratified PERMANOVA, BAL only: Horn distances against the per-method
# indicator columns and log10 read depth, permuting within subject.
bray_perm_bal <- local({
  ps_bal <- subset_samples(phyloseq_rel_nz, sample_type == "BAL")
  meta_bal <- data.frame(sample_data(ps_bal), check.names = FALSE)
  vegan::adonis2(
    distance(ps_bal, method = "horn") ~
      lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
    data = meta_bal,
    strata = meta_bal$subject_id,
    permutations = 10000
  )
})
# Stratified PERMANOVA, sputum only: Horn distances against the per-method
# indicator columns and log10 read depth, permuting within subject.
bray_perm_spt <- local({
  ps_sputum <- subset_samples(phyloseq_rel_nz, sample_type == "Sputum")
  meta_sputum <- data.frame(sample_data(ps_sputum), check.names = FALSE)
  vegan::adonis2(
    distance(ps_sputum, method = "horn") ~
      lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
    data = meta_sputum,
    strata = meta_sputum$subject_id,
    permutations = 10000
  )
})
# Table S11: format the all-sample PERMANOVA result as an HTML table with
# readable term labels, degrees of freedom, R2, p-value and stars.
tableS11 <- bray_perm_inter %>%
  data.frame(check.names = FALSE) %>%
  rownames_to_column("row.names") %>%
  # Map model terms to display labels; a term missing from the lookup
  # becomes NA.
  mutate(row.names = unname(c(
    "sample_type" = "Sample type",
    "treatment" = "Treatment",
    "subject_id" = "Subject",
    "log10(Final_reads)" = "log10(Final reads)",
    "sample_type:treatment" = "Sample type * Treatment",
    "Residual" = "Residual",
    "Total" = "Total"
  )[row.names])) %>%
  column_to_rownames("row.names") %>%
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.001 ~ "***",
                         abs(`Pr(>F)`) < 0.01 ~ "**",
                         abs(`Pr(>F)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  # Round first, then print the p-value with three decimals.
  mutate(across(where(is.numeric), ~ round(.x, digits = 3)),
         `Pr(>F)` = format(`Pr(>F)`, nsmall = 3)) %>%
  rename("<i>p</i>-value" = "Pr(>F)",
         "R<sup>2</sup>" = "R2",
         "Degree of freedom" = "Df") %>%
  select(c("Degree of freedom", "R<sup>2</sup>", "<i>p</i>-value", " ")) %>%
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
tableS11
| Degree of freedom | R2 | p-value | ||
|---|---|---|---|---|
| Sample type | 3 | 0.556 | 0.002 | ** |
| Treatment | 5 | 0.069 | 0.001 | *** |
| Subject | 17 | 0.252 | 0.001 | *** |
| log10(Final reads) | 1 | 0.036 | 0.000 | *** |
| Sample type * Treatment | 15 | 0.033 | 0.096 | |
| Residual | 81 | 0.054 | NA | |
| Total | 122 | 1.000 | NA |
# Export Table S11 as a self-contained HTML file in the manuscript figure folder.
save_kable(
  tableS11,
  file = "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/7_Manuscripts/2022_MGK_Host_Depletion/Figures/tableS11.html",
  self_contained = TRUE
)
Function beta - stratified
# BAL column group of the stratified PERMANOVA table: R2, p-value and stars
# per term, with readable term labels as row names.
a <- bray_perm_bal %>%
  data.frame(check.names = FALSE) %>%
  rownames_to_column("row.names") %>%
  # Map model terms to display labels; a term missing from the lookup
  # becomes NA.
  mutate(row.names = unname(c(
    "lypma" = "lyPMA",
    "benzonase" = "Benzonase",
    "host_zero" = "Host zero",
    "molysis" = "Molysis",
    "qiaamp" = "QIAamp",
    "subject_id" = "Subject id",
    "log10(Final_reads)" = "log10(Final reads)",
    "Residual" = "Residual",
    "Total" = "Total"
  )[row.names])) %>%
  column_to_rownames("row.names") %>%
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.001 ~ "***",
                         abs(`Pr(>F)`) < 0.01 ~ "**",
                         abs(`Pr(>F)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  rename("<i>p</i>-value" = "Pr(>F)",
         "R<sup>2</sup>" = "R2",
         "Degree of freedom" = "Df") %>%
  select(c("R<sup>2</sup>", "<i>p</i>-value", " "))
# Nasal-swab column group of the stratified PERMANOVA table: R2, p-value and
# stars per term, with readable term labels as row names.
b <- bray_perm_ns %>%
  data.frame(check.names = FALSE) %>%
  rownames_to_column("row.names") %>%
  # Map model terms to display labels; a term missing from the lookup
  # becomes NA.
  mutate(row.names = unname(c(
    "lypma" = "lyPMA",
    "benzonase" = "Benzonase",
    "host_zero" = "Host zero",
    "molysis" = "Molysis",
    "qiaamp" = "QIAamp",
    "subject_id" = "Subject id",
    "log10(Final_reads)" = "log10(Final reads)",
    "Residual" = "Residual",
    "Total" = "Total"
  )[row.names])) %>%
  column_to_rownames("row.names") %>%
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.001 ~ "***",
                         abs(`Pr(>F)`) < 0.01 ~ "**",
                         abs(`Pr(>F)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  rename("<i>p</i>-value" = "Pr(>F)",
         "R<sup>2</sup>" = "R2",
         "Degree of freedom" = "Df") %>%
  select(c("R<sup>2</sup>", "<i>p</i>-value", " "))
# Sputum column group of the stratified PERMANOVA table: R2, p-value and
# stars per term, with readable term labels as row names.
# (The object is named `c` because the table-assembly step reads it by that
# name; calls to base c() still resolve to the function.)
c <- bray_perm_spt %>%
  data.frame(check.names = FALSE) %>%
  rownames_to_column("row.names") %>%
  # Map model terms to display labels; a term missing from the lookup
  # becomes NA.
  mutate(row.names = unname(c(
    "lypma" = "lyPMA",
    "benzonase" = "Benzonase",
    "host_zero" = "Host zero",
    "molysis" = "Molysis",
    "qiaamp" = "QIAamp",
    "subject_id" = "Subject id",
    "log10(Final_reads)" = "log10(Final reads)",
    "Residual" = "Residual",
    "Total" = "Total"
  )[row.names])) %>%
  column_to_rownames("row.names") %>%
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.001 ~ "***",
                         abs(`Pr(>F)`) < 0.01 ~ "**",
                         abs(`Pr(>F)`) < 0.05 ~ "*",
                         .default = " ")) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  rename("<i>p</i>-value" = "Pr(>F)",
         "R<sup>2</sup>" = "R2",
         "Degree of freedom" = "Df") %>%
  select(c("R<sup>2</sup>", "<i>p</i>-value", " "))
# Put the three stratified PERMANOVA tables side by side and label each group
# of three columns with its sample type.
kable_styling(
  add_header_above(
    kbl(cbind(a, b, c), format = "html", escape = FALSE),
    c(" " = 1, "BAL" = 3, "Nasal swab" = 3, "Sputum" = 3)
  ),
  full_width = FALSE,
  html_font = "serif"
)
| R2 | p-value | R2 | p-value | R2 | p-value | ||||
|---|---|---|---|---|---|---|---|---|---|
| lyPMA | -0.034 | 0.962 | -0.032 | 0.992 | -0.083 | 1 | |||
| Benzonase | 0.069 | 0.082 | 0.002 | 0.634 | -0.062 | 1 | |||
| Host zero | 0.017 | 0.292 | 0.042 | 0.243 | 0.120 | 0 | *** | ||
| Molysis | 0.125 | 0.034 | * | 0.084 | 0.096 | 0.258 | 0 | *** | |
| QIAamp | -0.012 | 0.900 | 0.357 | 0.000 | *** | 0.687 | 0 | *** | |
| log10(Final reads) | 0.414 | 0.090 | 0.142 | 0.025 | * | 0.066 | 0 | *** | |
| Residual | 0.422 | NA | 0.405 | NA | 0.014 | NA | |||
| Total | 1.000 | NA | 1.000 | NA | 1.000 | NA |
A9. DA for function
Function DA analysis
#DA analysis - MaAslin
# Store log10 read depth as its own sample variable, log10.Final_reads.
sample_data(phyloseq_rel_nz)$log10.Final_reads <- log10(sample_data(phyloseq_rel_nz)$Final_reads)
#Running MaAslin for all sample without decontam
#for taxa differentially abundant by host depletion method, look to see which ones overlap with potential contaminant taxa
# Maaslin - # # y ~ log(final reads) + sample_type + treatment -----------
# The MaAsLin result tables are read from <path_working>/data, so the project
# location is set once (path_working in the document header) instead of being
# repeated in every call.
#all samples
f_maaslin_all <- read.csv(file.path(path_working, "data", "filt_f_maaslin_all.csv"))
# Stratified results: BAL, sputum, nasal swab and mock (positive control).
f_fit_data_bal <- read.csv(file.path(path_working, "data", "filt_f_fit_data_bal.csv"))
f_fit_data_spt <- read.csv(file.path(path_working, "data", "filt_f_fit_data_spt.csv"))
f_fit_data_ns <- read.csv(file.path(path_working, "data", "filt_f_fit_data_ns.csv"))
f_fit_data_pos <- read.csv(file.path(path_working, "data", "filt_f_fit_data_pos.csv"))
Again, most differentially abundant (DA) functions were sample-type specific.
#Making significance table for figure
# make_sig_table(): build the q-value tables behind one balloon plot.
#
# data: long MaAsLin result table with columns `feature`, `metadata` and
#       `qval` (one row per feature x metadata term).
#
# Returns a list with
#   data        - up to 20 features with the smallest q-value, one q-value
#                 column per depletion method (lypma ... qiaamp);
#   data_italic - the same q-values with features as row names and display
#                 column names (lyPMA, Benzonase, Host zero, Molysis, QIAamp);
#   data_sig    - "*" where the q-value is below 0.1, NA otherwise.
make_sig_table <- function(data) {
  treatment_cols <- c("lypma", "benzonase", "host_zero", "molysis", "qiaamp")

  # Wide table: one row per feature, one q-value column per metadata term.
  sig_data <- spread(
    data[order(data$qval), c("feature", "metadata", "qval")],
    metadata,
    qval
  )
  sig_data$feature <- gsub("[.]", "-", sig_data$feature)

  # Smallest q-value per feature, taken over the numeric q-value columns only.
  # Running apply() over the whole data frame would coerce every cell to text
  # (the feature column is character) and return a string minimum.
  qval_cols <- setdiff(names(sig_data), "feature")
  sig_data$min <- do.call(
    pmin,
    c(unname(as.list(sig_data[qval_cols])), na.rm = TRUE)
  )

  # Rank by that minimum and keep at most 20 features; head() does not pad
  # with NA rows when fewer than 20 are available.
  sig_data <- head(
    sig_data[order(sig_data$min), c("feature", treatment_cols)],
    20
  )

  # Restore the bracketed species name. Both spellings are matched because the
  # dot-to-dash replacement above has already rewritten the dotted form.
  mangled_collinsella <- c("X.Collinsella._massiliensis",
                           "X-Collinsella-_massiliensis")
  sig_data[["feature"]] <- ifelse(
    sig_data[["feature"]] %in% mangled_collinsella,
    "[Collinsella]_massiliensis",
    sig_data[["feature"]]
  )

  # q-values keyed by feature, with display names for the methods.
  sig_data_italic <- sig_data[treatment_cols]
  rownames(sig_data_italic) <- sig_data[["feature"]]
  names(sig_data_italic) <- c("lyPMA", "Benzonase", "Host zero", "Molysis",
                              "QIAamp")

  # Asterisk where q < 0.1, NA elsewhere.
  sig_data_sig <- data.frame(
    ifelse(sig_data_italic < 0.1, "*", NA),
    check.names = FALSE
  )

  list(data = sig_data, data_italic = sig_data_italic, data_sig = sig_data_sig)
}
# Replace each stratified MaAsLin result table by the list that
# make_sig_table() builds from it (elements: data, data_italic, data_sig).
f_fit_data_pos <- make_sig_table(data = f_fit_data_pos)
f_fit_data_bal <- make_sig_table(data = f_fit_data_bal)
f_fit_data_ns <- make_sig_table(data = f_fit_data_ns)
f_fit_data_spt <- make_sig_table(data = f_fit_data_spt)
# Mock samples, restricted to the features listed in the mock significance
# table.
f_pos_sig <- subset_taxa(
  subset_samples(phyloseq_rel_nz, sample_type == "Mock"),
  taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Mock")) %in% f_fit_data_pos$data$feature
)
# Mean abundance per treatment, reshaped to features x treatments, ordered as
# in data_italic, with zeros replaced by NA.
f_fit_data_pos$rel <- cbind(t(otu_table(f_pos_sig)), sample_data(f_pos_sig)) %>%
  group_by(treatment) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  # treatment plus the first 20 numeric columns
  .[, 1:21] %>%
  column_to_rownames("treatment") %>%
  t() %>%
  data.frame(check.names = FALSE) %>%
  .[row.names(f_fit_data_pos$data_italic), ] %>%
  mutate(across(everything(), ~ na_if(.x, 0))) %>%
  rownames_to_column("feature")
# Sputum samples, restricted to the features listed in the sputum
# significance table.
f_spt_sig <- subset_taxa(
  subset_samples(phyloseq_rel_nz, sample_type == "Sputum"),
  taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Sputum")) %in% f_fit_data_spt$data$feature
)
# Mean abundance per treatment, reshaped to features x treatments, ordered as
# in data_italic, with zeros replaced by NA.
f_fit_data_spt$rel <- cbind(t(otu_table(f_spt_sig)), sample_data(f_spt_sig)) %>%
  group_by(treatment) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  # treatment plus the first 20 numeric columns
  .[, 1:21] %>%
  column_to_rownames("treatment") %>%
  t() %>%
  data.frame(check.names = FALSE) %>%
  .[row.names(f_fit_data_spt$data_italic), ] %>%
  mutate(across(everything(), ~ na_if(.x, 0))) %>%
  rownames_to_column("feature")
# Nasal-swab samples, restricted to the features listed in the nasal
# significance table.
f_ns_sig <- subset_taxa(
  subset_samples(phyloseq_rel_nz, sample_type == "Nasal"),
  taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Nasal")) %in% f_fit_data_ns$data$feature
)
# Mean abundance per treatment, reshaped to features x treatments, ordered as
# in data_italic, with zeros replaced by NA.
f_fit_data_ns$rel <- cbind(t(otu_table(f_ns_sig)), sample_data(f_ns_sig)) %>%
  group_by(treatment) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  # treatment plus the first 20 numeric columns
  .[, 1:21] %>%
  column_to_rownames("treatment") %>%
  t() %>%
  data.frame(check.names = FALSE) %>%
  .[row.names(f_fit_data_ns$data_italic), ] %>%
  mutate(across(everything(), ~ na_if(.x, 0))) %>%
  rownames_to_column("feature")
# Overwrite the feature column with the row names of the significance table.
f_fit_data_ns$rel$feature <- row.names(f_fit_data_ns$data_sig)
# BAL samples, restricted to the features listed in the BAL significance
# table.
f_bal_sig <- subset_taxa(
  subset_samples(phyloseq_rel_nz, sample_type == "BAL"),
  taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "BAL")) %in% f_fit_data_bal$data$feature
)
# Mean abundance per treatment, reshaped to features x treatments, ordered as
# in data_italic, with zeros replaced by NA.
f_fit_data_bal$rel <- cbind(t(otu_table(f_bal_sig)), sample_data(f_bal_sig)) %>%
  group_by(treatment) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  # treatment plus the first 20 numeric columns
  .[, 1:21] %>%
  column_to_rownames("treatment") %>%
  t() %>%
  data.frame(check.names = FALSE) %>%
  .[row.names(f_fit_data_bal$data_italic), ] %>%
  mutate(across(everything(), ~ na_if(.x, 0))) %>%
  rownames_to_column("feature")
MaAslin function - volcano plot (Fig. S9)
Figure S9. Volcano plot of sequencing depth adjusted differential abundance of function by each treatment
#Volcano plot
# One point per MaAsLin association: coefficient on x, -log10(q) on y,
# coloured by covariate. The horizontal guide at y = 1 is q = 0.1 and the
# vertical guide marks a coefficient of zero.
figS9 <- ggplot(
  f_maaslin_all,
  aes(x = coef, y = -log10(qval), colour = metadata)
) +
  theme_classic(base_family = "serif") +
  #labs(tag = "A") +
  geom_point(size = 2) +
  labs(x = "MaAslin coefficient", y = "-log<sub>10</sub>(*q*-value)") +
  geom_hline(yintercept = 1, colour = "gray") +
  geom_vline(xintercept = 0, colour = "gray") +
  annotate(
    "richtext",
    x = 0,
    y = 80,
    label = "<i>q</i>-value = 0.1, fold-change = 0",
    family = "serif"
  ) +
  theme(
    legend.position = "top",
    # markdown title so the subscript and italics render
    axis.title.y = ggtext::element_markdown()
  ) +
  #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  scale_colour_manual(
    values = c("#4daf4a", "#984ea3", "#f781bf", "#377eb8", "#ff7f00", "#ffff33", "#a65628"),
    breaks = c("log10.Final_reads", "sample_type", "lypma", "benzonase", "host_zero", "molysis", "qiaamp"),
    labels = c("log10(Final reads)", "Sample type", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")
  ) +
  guides(colour = guide_legend(title = "Covariates", title.position = "top", nrow = 2))
# Write Figure S9 to a PNG file: 180 x 90 mm at 600 dpi.
png(
  filename = "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/7_Manuscripts/2022_MGK_Host_Depletion/Figures/FigureS9.png", # Output file
  width = 180, # Plot width, in the units given below
  height = 90, # Plot height, in the units given below
  units = "mm",
  res = 600
) #fixing multiple page issue
figS9
# alpha diversity plots
#ggarrange(f4ad, ggarrange(f4e, f4f, ncol = 2),
# ncol = 1) # alpha diversity plots
# Close the PNG device so the file is written out.
dev.off()
## quartz_off_screen
## 2
# Show the figure in the rendered document as well.
figS9
Function balloon plot (Fig. S10)
Figure S10. Mean relative abundance of the top 20 significant functions identified by differential abundance analysis using MaAsLin. Analyses were stratified by sample type. (A) Mock community, (B) bronchoalveolar lavage, (C) nasal swabs, and (D) sputum. Statistical significance was noted at the level of q-value < 0.1.
# Panel A (mock community): balloon plot of the functions in f_fit_data_pos.
# Balloon size is the per-treatment mean abundance in copies per million; fill
# marks whether the MaAsLin q-value of that function/treatment pair is < 0.1.
f5a <- merge(
  # Abundances in long format: one row per feature x treatment.
  f_fit_data_pos$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  # q-values in long format. Only lyPMA:QIAamp are gathered, so pairs without
  # a q-value leave the outer merge with qval = NA.
  f_fit_data_pos$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by.x = c("feature", "treatment"),
  by.y = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    f_fit_data_pos$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by.x = c("feature", "treatment"),
    by.y = c("feature", "treatment"),
    all = TRUE
  ) %>%
  # Classify from the numeric q-value. The merged `sig` column holds "*"/NA
  # markers, so `sig < 0.1` would be a string comparison that only works
  # through collation order. A missing q-value falls to "> 0.1".
  mutate(sig = case_when(qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1"),
         # proportion -> copies per million
         value = value * 1000000) %>%
  # Balloon plot
  ggballoonplot(size = "value", y = "feature", x = "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Function") +
  labs(tag = "A") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    # element_markdown renders markdown/HTML in the feature labels
    axis.text.y = ggtext::element_markdown(size = 7),
    axis.title.x = element_text(margin = margin(t = 20))
  ) +
  # Named values pin each colour to its level, so a panel that happens to
  # contain only one of the two levels still gets the right colour.
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(
    fill = guide_legend(
      title = expression(paste(italic("q"), "-value", sep = "")),
      title.position = "top"
    ),
    size = guide_legend(
      title = "Copies per million",
      title.position = "top",
      order = 1,
      nrow = 1
    )
  ) +
  # NOTE(review): the gathered treatment levels are column names such as
  # Untreated/lyPMA/QIAamp, which do not match these lower-case keys, so this
  # mapping looks like a no-op -- confirm before relying on it.
  scale_x_discrete(labels = c("control" = "Untreated",
                              "lypma" = "lyPMA",
                              "benzonase" = "Benzonase",
                              "host_zero" = "Host-zero",
                              "molysis" = "Molysis",
                              "qiaamp" = "QIAamp"))
#ffff33 qia
# Panel B (bronchoalveolar lavage): balloon plot of the functions in
# f_fit_data_bal. Balloon size is the per-treatment mean abundance in copies
# per million; fill marks whether the MaAsLin q-value of that
# function/treatment pair is < 0.1.
f5b <- merge(
  # Abundances in long format: one row per feature x treatment.
  f_fit_data_bal$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  # q-values in long format. Only lyPMA:QIAamp are gathered, so pairs without
  # a q-value leave the outer merge with qval = NA.
  f_fit_data_bal$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by.x = c("feature", "treatment"),
  by.y = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    f_fit_data_bal$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by.x = c("feature", "treatment"),
    by.y = c("feature", "treatment"),
    all = TRUE
  ) %>%
  # Classify from the numeric q-value. The merged `sig` column holds "*"/NA
  # markers, so `sig < 0.1` would be a string comparison that only works
  # through collation order. A missing q-value falls to "> 0.1".
  mutate(sig = case_when(qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1"),
         # proportion -> copies per million
         value = value * 1000000) %>%
  # Balloon plot
  ggballoonplot(size = "value", y = "feature", x = "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Function") +
  labs(tag = "B") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    # element_markdown renders markdown/HTML in the feature labels
    axis.text.y = ggtext::element_markdown(size = 7),
    axis.title.x = element_text(margin = margin(t = 20))
  ) +
  # Named values pin each colour to its level. With positional values the
  # colour vector has to be reversed by hand whenever a panel contains only
  # the "> 0.1" level, and it is wrong as soon as both levels appear.
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(
    fill = guide_legend(
      title = expression(paste(italic("q"), "-value", sep = "")),
      title.position = "top"
    ),
    size = guide_legend(
      title = "Copies per million",
      title.position = "top",
      order = 1,
      nrow = 1
    )
  ) +
  # NOTE(review): the gathered treatment levels are column names such as
  # Untreated/lyPMA/QIAamp, which do not match these lower-case keys, so this
  # mapping looks like a no-op -- confirm before relying on it.
  scale_x_discrete(labels = c("control" = "Untreated",
                              "lypma" = "lyPMA",
                              "benzonase" = "Benzonase",
                              "host_zero" = "Host-zero",
                              "molysis" = "Molysis",
                              "qiaamp" = "QIAamp"))
# Panel C (nasal swabs): balloon plot of the functions in f_fit_data_ns.
# Balloon size is the per-treatment mean abundance in copies per million; fill
# marks whether the MaAsLin q-value of that function/treatment pair is < 0.1.
f5c <- merge(
  # Abundances in long format: one row per feature x treatment.
  f_fit_data_ns$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  # q-values in long format. Only lyPMA:QIAamp are gathered, so pairs without
  # a q-value leave the outer merge with qval = NA.
  f_fit_data_ns$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by.x = c("feature", "treatment"),
  by.y = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    f_fit_data_ns$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by.x = c("feature", "treatment"),
    by.y = c("feature", "treatment"),
    all = TRUE
  ) %>%
  # Classify from the numeric q-value. The merged `sig` column holds "*"/NA
  # markers, so `sig < 0.1` would be a string comparison that only works
  # through collation order. A missing q-value falls to "> 0.1".
  mutate(sig = case_when(qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1"),
         # proportion -> copies per million
         value = value * 1000000) %>%
  # Balloon plot
  ggballoonplot(size = "value", y = "feature", x = "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Function") +
  labs(tag = "C") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    # element_markdown renders markdown/HTML in the feature labels
    axis.text.y = ggtext::element_markdown(size = 7),
    axis.title.x = element_text(margin = margin(t = 20))
  ) +
  # Named values pin each colour to its level, so a panel that happens to
  # contain only one of the two levels still gets the right colour.
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(
    fill = guide_legend(
      title = expression(paste(italic("q"), "-value", sep = "")),
      title.position = "top"
    ),
    size = guide_legend(
      title = "Copies per million",
      title.position = "top",
      order = 1,
      nrow = 1
    )
  ) +
  # NOTE(review): the gathered treatment levels are column names such as
  # Untreated/lyPMA/QIAamp, which do not match these lower-case keys, so this
  # mapping looks like a no-op -- confirm before relying on it.
  scale_x_discrete(labels = c("control" = "Untreated",
                              "lypma" = "lyPMA",
                              "benzonase" = "Benzonase",
                              "host_zero" = "Host-zero",
                              "molysis" = "Molysis",
                              "qiaamp" = "QIAamp"))
# Panel D (sputum): balloon plot of the functions in f_fit_data_spt.
# Balloon size is the per-treatment mean abundance in copies per million; fill
# marks whether the MaAsLin q-value of that function/treatment pair is < 0.1.
f5d <- merge(
  # Abundances in long format: one row per feature x treatment.
  f_fit_data_spt$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  # q-values in long format. Only lyPMA:QIAamp are gathered, so pairs without
  # a q-value leave the outer merge with qval = NA.
  f_fit_data_spt$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by.x = c("feature", "treatment"),
  by.y = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    f_fit_data_spt$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by.x = c("feature", "treatment"),
    by.y = c("feature", "treatment"),
    all = TRUE
  ) %>%
  # Classify from the numeric q-value. The merged `sig` column holds "*"/NA
  # markers, so `sig < 0.1` would be a string comparison that only works
  # through collation order. A missing q-value falls to "> 0.1".
  mutate(sig = case_when(qval < 0.1 ~ "< 0.1",
                         .default = "> 0.1"),
         # proportion -> copies per million
         value = value * 1000000) %>%
  # Balloon plot
  ggballoonplot(size = "value", y = "feature", x = "treatment", fill = "sig") +
  theme_classic(base_family = "serif") +
  xlab("Experimental group") +
  ylab("Function") +
  labs(tag = "D") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    # element_markdown renders markdown/HTML in the feature labels
    axis.text.y = ggtext::element_markdown(size = 7),
    axis.title.x = element_text(margin = margin(t = 20))
  ) +
  # Named values pin each colour to its level, so a panel that happens to
  # contain only one of the two levels still gets the right colour.
  scale_fill_manual(values = c("< 0.1" = "red", "> 0.1" = "grey")) +
  guides(
    fill = guide_legend(
      title = expression(paste(italic("q"), "-value", sep = "")),
      title.position = "top"
    ),
    size = guide_legend(
      # same legend title as panels A-C
      title = "Copies per million",
      title.position = "top",
      order = 1,
      nrow = 1
    )
  ) +
  # NOTE(review): the gathered treatment levels are column names such as
  # Untreated/lyPMA/QIAamp, which do not match these lower-case keys, so this
  # mapping looks like a no-op -- confirm before relying on it.
  scale_x_discrete(labels = c("control" = "Untreated",
                              "lypma" = "lyPMA",
                              "benzonase" = "Benzonase",
                              "host_zero" = "Host-zero",
                              "molysis" = "Molysis",
                              "qiaamp" = "QIAamp"))
# Figure S10: the four function balloon panels on one page, aligned on both
# axes and sharing a single legend.
figS10 <- ggarrange(
  plotlist = list(f5a, f5b, f5c, f5d),
  align = "hv",
  common.legend = TRUE
)
figS10
# Write Figure S10 to a PNG file: 180 x 170 mm at 600 dpi.
png(
  filename = "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/7_Manuscripts/2022_MGK_Host_Depletion/Figures/FigureS10.png", # Output file
  width = 180, # Plot width, in the units given below
  height = 170, # Plot height, in the units given below
  units = "mm",
  res = 600
) #fixing multiple page issue
figS10
# alpha diversity plots
#ggarrange(f4ad, ggarrange(f4e, f4f, ncol = 2),
# ncol = 1) # alpha diversity plots
# Close the PNG device so the file is written out.
dev.off()
## quartz_off_screen
## 2
A10. Rarefaction analysis
As a sanity check, rarefaction based analysis was conducted using LDM
Rarefaction analysis
Rarefaction stratified (BAL)
# Analysis settings. Without command-line arguments the defaults below are
# used; otherwise each argument is evaluated as an R assignment
# (e.g. "dist_type=1").
args <- commandArgs(TRUE)
if (length(args) == 0) {
  print("No arguments supplied")
  dist_type <- 5 # 1:"horn"; 5: "jaccard"; 4: "unwt-unifrac"
  data_full <- 1 # 1: full sample; 0: a subsample of 50 subjects
  i_seed <- seed # seed for generating different subsamples
} else {
  # NOTE(review): eval(parse()) runs arbitrary R code taken from the command
  # line. Acceptable for a locally run script; do not expose to untrusted input.
  for (i in seq_along(args)) {
    eval(parse(text = args[[i]]))
  }
}
## [1] "No arguments supplied"
# Translate the numeric code into the distance name used by the analysis.
# An unknown code stops here instead of leaving dist_method undefined.
if (dist_type == 5) {
  dist_method <- "jaccard"
} else if (dist_type == 4) {
  dist_method <- "unwt-unifrac"
} else if (dist_type == 1) {
  dist_method <- "horn"
} else {
  stop("Unsupported dist_type: ", dist_type, " (expected 1, 4 or 5)",
       call. = FALSE)
}
###############################################
# read in data
###############################################
# read in otu table, taxonomy info
#dat <- read_biom("rawData/otu_table.biom")
#otu_table <- as.data.frame(as.matrix(biom_data(dat)))
#taxonomy <- observation_metadata(dat)
#crohn.tree <- read.tree("rawData/insertion_tree.relabelled.tre")
# Count table restricted to BAL samples; transposed so that samples are rows
# and OTUs are columns.
otu_table_col <- as.data.frame(t(otu_table(
  subset_samples(phyloseq$phyloseq_count, sample_type %in% c("BAL"))
)))
# Drop OTUs that are absent from every BAL sample. drop = FALSE keeps a
# data frame even if only a single OTU survives.
otu_table_col <- otu_table_col[, colSums(otu_table_col > 0) > 0, drop = FALSE]
# read in metadata
metadata <- data.frame(sample_data(
  subset_samples(phyloseq$phyloseq_count, sample_type %in% c("BAL"))
))
# create dichotomous variable for diagnosis
#metadata <- cbind(metadata,ifelse(metadata$diagnosis=="CD" | metadata$diagnosis=="IC" | metadata$diagnosis=="UC", 1, 0))
#colnames(metadata)[61] <- "disease"
# sort metadata to match otu table (row names of the OTU table are matched
# against metadata$baylor_other_id)
meta.sorted <- metadata[order(match(metadata$baylor_other_id, rownames(otu_table_col))), ]
# remove duplicates at the end
#meta.site.dup <- subset(meta.sorted, collection=="RISK")
#meta.sub.dup <- subset(meta.site.dup, type_sample=="biopsy" & biopsy_location=="Rectum")
# want subset with only biopsy samples, no follow-up measurements, no missing values in key covariates
#no.follow <- meta.sorted[!duplicated(meta.sorted$anonymized_name),]
#meta.site <- subset(no.follow, collection=="RISK")
#meta.sub <- subset(meta.site, type_sample=="biopsy" & biopsy_location=="Rectum")
# match otu table with cleaned meta data
# (the former trailing `%>% filter(.,)` had no condition and did nothing)
otu.sub <- subset(otu_table_col, rownames(otu_table_col) %in% metadata$baylor_other_id)
### Prevalence filtering and decontam are not repeated here because the
### phyloseq object was already filtered
# filter otu table
pa.table <- 1 * (otu.sub > 0) # presence/absence table
#otu.sub.f <- otu.sub[, -which(colSums(pa.table)<5)] #Prevalence filtering
#dim(otu.sub)
#dim(otu.sub.f) # 16 taxa were removed
# keep only samples with more than 5000 reads in total
otu.full <- otu.sub[rowSums(otu.sub) > 5000, ]
# metadata rows for the samples that passed the read-depth filter
meta.sub <- subset(meta.sorted, meta.sorted$baylor_other_id %in% rownames(otu.full))
dim(otu.sub)
## [1] 30 88
dim(otu.full) # samples with <= 5000 reads were dropped (30 -> 27 rows in the recorded run)
## [1] 27 88
min(rowSums(otu.full))
## [1] 10612
meta.full <- subset(meta.sub, meta.sub$baylor_other_id %in% rownames(otu.full))
# Compare per-sample total read counts between the groups in meta.full$control
# with a Welch t-test (var.equal = FALSE) and a Wilcoxon rank-sum test.
# The author notes these are not needed for the main analysis; only the
# p-values are stored.
p.t.full = t.test(rowSums(otu.full) ~ meta.full$control, var.equal=FALSE)$p.value
p.wilcox.full = wilcox.test(rowSums(otu.full) ~ meta.full$control)$p.value
#1st. We will check all the samples, non-rarefied
########################
# prepare data for analysis
########################
# Select the full data set (data_full == 1) or a random subsample.
if (data_full == 1) {
  otu.tab <- otu.full
  meta.data <- meta.full
} else {
  # The subsample objects were never created for the BAL data before this
  # point, so this branch failed (or silently reused objects left over from
  # another section). Draw the subsample here from the BAL tables.
  set.seed(seed)
  otu.sam10 <- otu.full[sort(sample(nrow(otu.full), min(10, nrow(otu.full)))), ]
  meta.sam10 <- subset(meta.full, meta.full$baylor_other_id %in% rownames(otu.sam10))
  otu.tab <- otu.sam10
  meta.data <- meta.sam10
}
dim(otu.tab)
## [1] 27 88
dim(meta.data)
## [1] 27 66
########################
# permanova-F
########################
# LDM-F: averaging the F statistics over all rarefactions.
# The former if/else on data_full built the same formula in both branches.
formula <- otu.tab ~ treatment
res.F <- permanovaFL(
  formula = formula,
  # meta.data (not meta.sub) so the covariate rows always correspond to the
  # rows of otu.tab, also when data_full == 0 selects the subsample.
  data = meta.data,
  dist.method = dist_method,
  scale.otu.table = FALSE,
  n.rarefy = 100,
  n.rej.stop = 100,
  seed = seed,
  # Permutation scheme: none between clusters, free within clusters, with
  # clusters identified by subject_id.
  perm.between.type = "none",
  perm.within.type = "free",
  cluster.id = "subject_id"
)
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## PERMANOVA stopped at permutation 1100
# Author's note: within-cluster permutation requires clusters of the same
# size; because some samples failed sequencing that option was considered
# not valid, and subject_id was to be added as a fixed term instead.
# NOTE(review): the permanovaFL call above still passes cluster.id and
# perm.within.type, and its formula contains only `treatment` - confirm
# which design is intended.
# Store the p-value(s) returned by the permanova-F fit.
p_permanovaF <- res.F$p.permanova
# -----------------------------
# dist for permanova-D2 (1-100)
# -----------------------------
# dist_sq_all[, , r] holds the SUM of the squared presence/absence distance
# matrices over rarefactions 1..r; it is divided by r where it is used.
n_rarefy <- 100
label <- colnames(otu.full)
n_sam_new <- nrow(otu.tab)
dist_sq_all <- array(0, dim = c(n_sam_new, n_sam_new, n_rarefy))
# Only Jaccard (5) and unweighted UniFrac (4) are implemented below. Any
# other code used to fall through both branches, so `dist` resolved to
# stats::dist and as.matrix() failed with an unrelated error.
if (!(dist_type %in% c(4, 5))) {
  stop("permanova-D2 distances are only implemented for dist_type 4 or 5, ",
       "not ", dist_type, call. = FALSE)
}
set.seed(seed)
dist_sq_sum <- matrix(0, nrow = n_sam_new, ncol = n_sam_new)
for (r in seq_len(n_rarefy)) {
  otu_table_rarefy <- Rarefy(otu.tab)$otu.tab.rff
  otu_table_rarefy1 <- (otu_table_rarefy > 0) * 1 # presence/absence
  colnames(otu_table_rarefy1) <- label
  colnames(otu_table_rarefy) <- label
  if (dist_type == 5) {
    dist_r <- vegdist(x = otu_table_rarefy1, method = dist_method)
  } else {
    # NOTE(review): no active code in this section loads `crohn.tree`; a
    # tree object must be in scope for this branch to work.
    dist_r <- GUniFrac(otu_table_rarefy1, crohn.tree, alpha = c(1))$unifrac[, , "d_UW"]
  }
  # running sum of squared distances (same values as adding to slice r - 1)
  dist_sq_sum <- dist_sq_sum + as.matrix(dist_r)^2
  dist_sq_all[, , r] <- dist_sq_sum
}
# --------------------------
# dist for permanova-D2-A
# --------------------------
# LDM-A: averaging the RSS terms over all rarefactions. The two squared
# distance matrices (suffix o1 / o2) are taken from the LDM helper's result.
if (dist_type == 5) {
  res <- jaccard.mean(otu.tab)
  dist_sq_inf_o1 <- res$jac.mean.sq.o1
  dist_sq_inf_o2 <- res$jac.mean.sq.o2
} else if (dist_type == 4) {
  # NOTE(review): no active code in this section loads `crohn.tree`.
  res <- unifrac.mean(otu.tab, crohn.tree)
  dist_sq_inf_o1 <- res$unifrac.mean.sq.o1
  dist_sq_inf_o2 <- res$unifrac.mean.sq.o2
} else {
  # Previously any other code skipped both branches, leaving the matrices
  # undefined or stale from an earlier run.
  stop("permanova-D2-A distances are only implemented for dist_type 4 or 5, ",
       "not ", dist_type, call. = FALSE)
}
# Negative controls had 0 reads after rarefying; OTU tables containing such
# all-zero samples caused problems in data processing, so the negative
# controls were removed from the rarefaction analysis.
#########################
# permanova-D2, D2-A
#########################
# The formula reads dist_sq_r, which is reassigned on every loop iteration.
# (The former if/else on data_full built the same formula in both branches.)
formula <- dist_sq_r ~ treatment
p_permanovaD2 <- NULL
p_permanovaD2A2 <- NULL
p_permanovaD2A1 <- NULL
# r = 1..10, 50, 100: mean squared distance over the first r rarefactions.
# The sentinels 100^2 and 100^3 select the D2-A2 and D2-A1 matrices.
for (r in c(1:10, 50, 100, 100^2, 100^3)) {
  cat("number of rarefaction:", r, "\n")
  if (r == 100^2) {
    dist_sq_r <- dist_sq_inf_o2
  } else if (r == 100^3) {
    dist_sq_r <- dist_sq_inf_o1
  } else {
    dist_sq_r <- dist_sq_all[, , r] / r # cumulative sum -> mean
  }
  res_permanovaD2 <- permanovaFL(
    formula = formula,
    data = meta.data,
    n.perm.max = 5000,
    square.dist = FALSE, # the input is already a squared distance
    n.rarefy = 0,
    n.rej.stop = 100,
    seed = seed,
    # Same permutation scheme as the permanova-F fit and the nasal-swab
    # section. The call previously ended in a stray empty argument and
    # omitted these settings, ignoring the subject_id clustering.
    perm.between.type = "none",
    perm.within.type = "free",
    cluster.id = "subject_id"
  )
  if (r == 100^2) {
    p_permanovaD2A2 <- res_permanovaD2$p.permanova
  } else if (r == 100^3) {
    p_permanovaD2A1 <- res_permanovaD2$p.permanova
  } else {
    p_permanovaD2 <- c(p_permanovaD2, res_permanovaD2$p.permanova)
  }
}
## number of rarefaction: 1
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 2
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 3
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 4
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 5
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 6
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 7
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 8
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 9
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 10
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 50
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 100
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 10000
## permutations: 100
## PERMANOVA stopped at permutation 100
## number of rarefaction: 1e+06
## permutations: 100
## PERMANOVA stopped at permutation 100
# Disabled export: would append all p-values plus the seed as one row to a
# text file named after data_full and dist_type.
#tab = c(p_permanovaF, p_permanovaD2, p_permanovaD2A2, p_permanovaD2A1, i_seed)
#write.table(t(tab), file = paste("", data_full, "_dist", dist_type, ".txt", sep=""),
# quote=FALSE, append=TRUE, row.names=FALSE, col.names=FALSE)
# Print the permanova-F p-values stored above.
p_permanovaF
## rarefy1 rarefy2 rarefy3 rarefy4 rarefy5 rarefy6 rarefy7
## 0.10090909 0.09909091 0.09727273 0.09909091 0.10090909 0.10000000 0.09818182
## rarefy8 rarefy9 rarefy10 rarefy11 rarefy12 rarefy13 rarefy14
## 0.09636364 0.09636364 0.09636364 0.09636364 0.09636364 0.09636364 0.09727273
## rarefy15 rarefy16 rarefy17 rarefy18 rarefy19 rarefy20 rarefy21
## 0.09727273 0.10000000 0.09909091 0.09909091 0.09909091 0.10090909 0.10000000
## rarefy22 rarefy23 rarefy24 rarefy25 rarefy26 rarefy27 rarefy28
## 0.10000000 0.10000000 0.09909091 0.10000000 0.10000000 0.10000000 0.10000000
## rarefy29 rarefy30 rarefy31 rarefy32 rarefy33 rarefy34 rarefy35
## 0.09909091 0.09909091 0.09909091 0.09909091 0.09909091 0.09909091 0.09909091
## rarefy36 rarefy37 rarefy38 rarefy39 rarefy40 rarefy41 rarefy42
## 0.09909091 0.09909091 0.09909091 0.09909091 0.09909091 0.10000000 0.10000000
## rarefy43 rarefy44 rarefy45 rarefy46 rarefy47 rarefy48 rarefy49
## 0.10000000 0.10000000 0.10000000 0.09909091 0.09909091 0.09909091 0.09909091
## rarefy50 rarefy51 rarefy52 rarefy53 rarefy54 rarefy55 rarefy56
## 0.09818182 0.09818182 0.09818182 0.09818182 0.09818182 0.09818182 0.09818182
## rarefy57 rarefy58 rarefy59 rarefy60 rarefy61 rarefy62 rarefy63
## 0.09818182 0.09818182 0.09818182 0.09818182 0.09818182 0.09818182 0.09909091
## rarefy64 rarefy65 rarefy66 rarefy67 rarefy68 rarefy69 rarefy70
## 0.09909091 0.09909091 0.09909091 0.09818182 0.09818182 0.09909091 0.09909091
## rarefy71 rarefy72 rarefy73 rarefy74 rarefy75 rarefy76 rarefy77
## 0.09909091 0.09909091 0.09909091 0.09818182 0.09818182 0.09818182 0.09909091
## rarefy78 rarefy79 rarefy80 rarefy81 rarefy82 rarefy83 rarefy84
## 0.09909091 0.09818182 0.09818182 0.09909091 0.09818182 0.09818182 0.09909091
## rarefy85 rarefy86 rarefy87 rarefy88 rarefy89 rarefy90 rarefy91
## 0.10000000 0.10000000 0.10000000 0.10000000 0.10000000 0.10000000 0.10000000
## rarefy92 rarefy93 rarefy94 rarefy95 rarefy96 rarefy97 rarefy98
## 0.10000000 0.10000000 0.10000000 0.10000000 0.10000000 0.10000000 0.10000000
## rarefy99 rarefy100
## 0.10000000 0.10000000
# D2 p-values in loop order: r = 1..10, 50, 100 rarefactions
p_permanovaD2
## [1] 1 1 1 1 1 1 1 1 1 1 1 1
# D2-A2 p-value (loop iteration r == 100^2)
p_permanovaD2A2
## [1] 1
# D2-A1 p-value (loop iteration r == 100^3)
p_permanovaD2A1
## [1] 1
# R.squared of the LAST fit in the loop only (r == 100^3, i.e. D2-A1)
res_permanovaD2$R.squared
## [1] 0.08734907
Rarefaction stratified (Nasal swab)
As a sanity check, rarefaction based analysis was conducted using LDM
# Same default/override logic as for the BAL analysis. `args` is not re-read
# here, so it must already exist in the session.
if (length(args) > 0) {
  # NOTE(review): every argument is evaluated as R code - trusted input only.
  for (i in seq_along(args)) {
    eval(parse(text = args[[i]]))
  }
} else {
  print("No arguments supplied")
  dist_type <- 5 # 1: "horn"; 5: "jaccard"; 4: "unwt-unifrac"
  data_full <- 1 # 1: full sample; 0: a random subsample of the samples
  i_seed <- seed # seed for generating different subsamples
}
## [1] "No arguments supplied"
# Translate the numeric distance code into the method name used by the
# distance and PERMANOVA calls further down.
dist_method <- switch(
  as.character(dist_type),
  "5" = "jaccard",
  "4" = "unwt-unifrac",
  "1" = "horn",
  # An unrecognised code used to leave dist_method undefined (or stale from
  # an earlier run); fail immediately instead.
  stop("Unsupported dist_type: ", dist_type, call. = FALSE)
)
###############################################
# read in data
###############################################
# read in otu table, taxonomy info
#dat <- read_biom("rawData/otu_table.biom")
#otu_table <- as.data.frame(as.matrix(biom_data(dat)))
#taxonomy <- observation_metadata(dat)
#crohn.tree <- read.tree("rawData/insertion_tree.relabelled.tre")
# Count table restricted to nasal samples; transposed so that samples are
# rows and OTUs are columns.
otu_table_col <- as.data.frame(t(otu_table(
  subset_samples(phyloseq$phyloseq_count, sample_type %in% c("Nasal"))
)))
# Drop OTUs that are absent from every nasal sample
otu_table_col <- otu_table_col[, colSums(otu_table_col > 0) > 0]
# read in metadata
metadata <- data.frame(sample_data(
  subset_samples(phyloseq$phyloseq_count, sample_type %in% c("Nasal"))
))
# create dichotomous variable for diagnosis
#metadata <- cbind(metadata,ifelse(metadata$diagnosis=="CD" | metadata$diagnosis=="IC" | metadata$diagnosis=="UC", 1, 0))
#colnames(metadata)[61] <- "disease"
# sort metadata to match otu table (row names of the OTU table are matched
# against metadata$baylor_other_id)
meta.sorted <- metadata[order(match(metadata$baylor_other_id, rownames(otu_table_col))), ]
# remove duplicates at the end
#meta.site.dup <- subset(meta.sorted, collection=="RISK")
#meta.sub.dup <- subset(meta.site.dup, type_sample=="biopsy" & biopsy_location=="Rectum")
# want subset with only biopsy samples, no follow-up measurements, no missing values in key covariates
#no.follow <- meta.sorted[!duplicated(meta.sorted$anonymized_name),]
#meta.site <- subset(no.follow, collection=="RISK")
#meta.sub <- subset(meta.site, type_sample=="biopsy" & biopsy_location=="Rectum")
# match otu table with cleaned meta data
otu.sub <- otu_table_col[
  rownames(otu_table_col) %in% metadata$baylor_other_id, , drop = FALSE
]
### Prevalence filtering and decontam are not repeated here because the
### phyloseq object was already filtered
# filter otu table
pa.table <- (otu.sub > 0) * 1 # presence/absence table
#otu.sub.f <- otu.sub[, -which(colSums(pa.table)<5)] #Prevalence filtering
#dim(otu.sub)
#dim(otu.sub.f) # 16 taxa were removed
# keep only samples with more than 5000 reads in total
otu.full <- otu.sub[rowSums(otu.sub) > 5000, ]
# metadata rows for the samples that passed the read-depth filter
meta.sub <- meta.sorted[
  meta.sorted$baylor_other_id %in% rownames(otu.full), , drop = FALSE
]
dim(otu.sub)
## [1] 35 45
dim(otu.full) # no sample was dropped by the > 5000 reads filter in the recorded run (35 -> 35 rows)
## [1] 35 45
min(rowSums(otu.full))
## [1] 173098
meta.full <- subset(meta.sub, meta.sub$baylor_other_id %in% rownames(otu.full))
# Compare per-sample total read counts between the groups in meta.full$control
# with a Welch t-test (var.equal = FALSE) and a Wilcoxon rank-sum test.
# The author notes these are not needed for the main analysis; only the
# p-values are stored.
p.t.full = t.test(rowSums(otu.full) ~ meta.full$control, var.equal=FALSE)$p.value
p.wilcox.full = wilcox.test(rowSums(otu.full) ~ meta.full$control)$p.value
# --------------------------
# a random subsample of (at most) 10 samples
# --------------------------
set.seed(seed)
# min() guards against sample() failing when fewer than 10 samples remain.
otu.sam10 <- otu.full[sort(sample(nrow(otu.full), min(10, nrow(otu.full)))), ]
# Match on baylor_other_id, the key used for every other OTU/metadata match
# in this analysis (this line previously matched on sample_name).
meta.sam10 <- subset(meta.sub, meta.sub$baylor_other_id %in% rownames(otu.sam10))
#1st. We will check all the samples, non-rarefied
########################
# prepare data for analysis
########################
# data_full == 1 selects the full tables, anything else the subsample.
otu.tab <- if (data_full == 1) otu.full else otu.sam10
meta.data <- if (data_full == 1) meta.full else meta.sam10
dim(otu.tab)
## [1] 35 45
dim(meta.data)
## [1] 35 66
########################
# permanova-F
########################
# LDM-F: averaging the F statistics over all rarefactions.
# The former if/else on data_full built the same formula in both branches.
formula <- otu.tab ~ treatment
res.F <- permanovaFL(
  formula = formula,
  # meta.data (not meta.sub) so the covariate rows always correspond to the
  # rows of otu.tab, also when data_full == 0 selects the subsample.
  data = meta.data,
  dist.method = dist_method,
  scale.otu.table = FALSE,
  n.rarefy = 100,
  n.rej.stop = 100,
  seed = seed,
  # Permutation scheme: none between clusters, free within clusters, with
  # clusters identified by subject_id.
  perm.between.type = "none",
  perm.within.type = "free",
  cluster.id = "subject_id"
)
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
# Author's note: within-cluster permutation requires clusters of the same
# size; because some samples failed sequencing that option was considered
# not valid, and subject_id was to be added as a fixed term instead.
# NOTE(review): the permanovaFL call above still passes cluster.id and
# perm.within.type, and its formula contains only `treatment` - confirm
# which design is intended.
# Store the p-value(s) returned by the permanova-F fit.
p_permanovaF <- res.F$p.permanova
# -----------------------------
# dist for permanova-D2 (1-100)
# -----------------------------
# dist_sq_all[, , r] holds the SUM of the squared presence/absence distance
# matrices over rarefactions 1..r; it is divided by r where it is used.
n_rarefy <- 100
label <- colnames(otu.full)
n_sam_new <- nrow(otu.tab)
dist_sq_all <- array(0, dim = c(n_sam_new, n_sam_new, n_rarefy))
# Only Jaccard (5) and unweighted UniFrac (4) are implemented below. Any
# other code used to fall through both branches, so `dist` resolved to
# stats::dist and as.matrix() failed with an unrelated error.
if (!(dist_type %in% c(4, 5))) {
  stop("permanova-D2 distances are only implemented for dist_type 4 or 5, ",
       "not ", dist_type, call. = FALSE)
}
set.seed(seed)
dist_sq_sum <- matrix(0, nrow = n_sam_new, ncol = n_sam_new)
for (r in seq_len(n_rarefy)) {
  otu_table_rarefy <- Rarefy(otu.tab)$otu.tab.rff
  otu_table_rarefy1 <- (otu_table_rarefy > 0) * 1 # presence/absence
  colnames(otu_table_rarefy1) <- label
  colnames(otu_table_rarefy) <- label
  if (dist_type == 5) {
    dist_r <- vegdist(x = otu_table_rarefy1, method = dist_method)
  } else {
    # NOTE(review): no active code in this section loads `crohn.tree`; a
    # tree object must be in scope for this branch to work.
    dist_r <- GUniFrac(otu_table_rarefy1, crohn.tree, alpha = c(1))$unifrac[, , "d_UW"]
  }
  # running sum of squared distances (same values as adding to slice r - 1)
  dist_sq_sum <- dist_sq_sum + as.matrix(dist_r)^2
  dist_sq_all[, , r] <- dist_sq_sum
}
# --------------------------
# dist for permanova-D2-A
# --------------------------
# LDM-A: averaging the RSS terms over all rarefactions. The two squared
# distance matrices (suffix o1 / o2) are taken from the LDM helper's result.
if (dist_type == 5) {
  res <- jaccard.mean(otu.tab)
  dist_sq_inf_o1 <- res$jac.mean.sq.o1
  dist_sq_inf_o2 <- res$jac.mean.sq.o2
} else if (dist_type == 4) {
  # NOTE(review): no active code in this section loads `crohn.tree`.
  res <- unifrac.mean(otu.tab, crohn.tree)
  dist_sq_inf_o1 <- res$unifrac.mean.sq.o1
  dist_sq_inf_o2 <- res$unifrac.mean.sq.o2
} else {
  # Previously any other code skipped both branches, leaving the matrices
  # undefined or stale from an earlier run.
  stop("permanova-D2-A distances are only implemented for dist_type 4 or 5, ",
       "not ", dist_type, call. = FALSE)
}
# Negative controls had 0 reads after rarefying; OTU tables containing such
# all-zero samples caused problems in data processing, so the negative
# controls were removed from the rarefaction analysis.
#########################
# permanova-D2, D2-A
#########################
# The formula reads dist_sq_r, which is reassigned on every loop iteration.
formula <- if (data_full == 1) dist_sq_r ~ treatment else dist_sq_r ~ treatment
p_permanovaD2 <- NULL
p_permanovaD2A2 <- NULL
p_permanovaD2A1 <- NULL
# r = 1..10, 50, 100: mean squared distance over the first r rarefactions.
# The sentinels 100^2 and 100^3 select the D2-A2 and D2-A1 matrices.
for (r in c(1:10, 50, 100, 100^2, 100^3)) {
  cat("number of rarefaction:", r, "\n")
  dist_sq_r <- if (r == 100^2) {
    dist_sq_inf_o2
  } else if (r == 100^3) {
    dist_sq_inf_o1
  } else {
    dist_sq_all[, , r] / r # cumulative sum -> mean
  }
  res_permanovaD2 <- permanovaFL(
    formula = formula,
    data = meta.data,
    n.perm.max = 5000,
    square.dist = FALSE,
    n.rarefy = 0,
    n.rej.stop = 100,
    seed = seed,
    perm.between.type = "none",
    perm.within.type = "free",
    cluster.id = "subject_id"
  )
  if (r == 100^3) {
    p_permanovaD2A1 <- res_permanovaD2$p.permanova
  } else if (r == 100^2) {
    p_permanovaD2A2 <- res_permanovaD2$p.permanova
  } else {
    p_permanovaD2 <- c(p_permanovaD2, res_permanovaD2$p.permanova)
  }
}
## number of rarefaction: 1
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 2
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 3
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 4
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 5
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 6
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 7
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 8
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 9
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 10
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 50
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 100
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 10000
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 1e+06
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
# Disabled export: would append all p-values plus the seed as one row to a
# text file named after data_full and dist_type.
#tab = c(p_permanovaF, p_permanovaD2, p_permanovaD2A2, p_permanovaD2A1, i_seed)
#write.table(t(tab), file = paste("", data_full, "_dist", dist_type, ".txt", sep=""),
# quote=FALSE, append=TRUE, row.names=FALSE, col.names=FALSE)
# Print the permanova-F p-values stored above.
p_permanovaF
## rarefy1 rarefy2 rarefy3 rarefy4 rarefy5 rarefy6 rarefy7
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy8 rarefy9 rarefy10 rarefy11 rarefy12 rarefy13 rarefy14
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy15 rarefy16 rarefy17 rarefy18 rarefy19 rarefy20 rarefy21
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy22 rarefy23 rarefy24 rarefy25 rarefy26 rarefy27 rarefy28
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy29 rarefy30 rarefy31 rarefy32 rarefy33 rarefy34 rarefy35
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy36 rarefy37 rarefy38 rarefy39 rarefy40 rarefy41 rarefy42
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy43 rarefy44 rarefy45 rarefy46 rarefy47 rarefy48 rarefy49
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy50 rarefy51 rarefy52 rarefy53 rarefy54 rarefy55 rarefy56
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy57 rarefy58 rarefy59 rarefy60 rarefy61 rarefy62 rarefy63
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy64 rarefy65 rarefy66 rarefy67 rarefy68 rarefy69 rarefy70
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy71 rarefy72 rarefy73 rarefy74 rarefy75 rarefy76 rarefy77
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy78 rarefy79 rarefy80 rarefy81 rarefy82 rarefy83 rarefy84
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy85 rarefy86 rarefy87 rarefy88 rarefy89 rarefy90 rarefy91
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy92 rarefy93 rarefy94 rarefy95 rarefy96 rarefy97 rarefy98
## 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976 0.00119976
## rarefy99 rarefy100
## 0.00119976 0.00119976
p_permanovaD2
## [1] 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## [7] 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
p_permanovaD2A2
## [1] 0.00019996
p_permanovaD2A1
## [1] 0.00019996
res_permanovaD2$R.squared
## [1] 0.3094375
Rarefaction stratified (Sputum)
As a sanity check, a rarefaction-based analysis was conducted using LDM.
# Resolve the run settings. Supplied arguments are evaluated one by one as R
# expressions; with no arguments the defaults below are used.
# NOTE(review): eval(parse()) runs whatever text is in `args`; only safe when
# the arguments come from a trusted caller.
if (length(args) > 0) {
  for (i in seq_along(args)) {
    eval(parse(text = args[[i]]))
  }
} else {
  print("No arguments supplied")
  dist_type <- 5 # 1: "horn"; 5: "jaccard"; 4: "unwt-unifrac"
  data_full <- 1 # 1: full sample; 0: a random subsample
  i_seed <- seed # seed for generating different subsamples
}
## [1] "No arguments supplied"
# Translate the numeric distance code into the method name passed to the
# distance and PERMANOVA calls. An unknown code now stops the run; before, it
# left `dist_method` unset (or holding a value from an earlier run in the same
# session) without any message.
if (dist_type == 5) {
  dist_method <- "jaccard"
} else if (dist_type == 4) {
  dist_method <- "unwt-unifrac"
} else if (dist_type == 1) {
  dist_method <- "horn"
} else {
  stop(
    "Unsupported dist_type: ", dist_type,
    " (expected 1 = horn, 4 = unwt-unifrac, 5 = jaccard)",
    call. = FALSE
  )
}
###############################################
# read in data
###############################################
# Template code for loading a BIOM table and a tree, kept for reference (not run):
#dat <- read_biom("rawData/otu_table.biom")
#otu_table <- as.data.frame(as.matrix(biom_data(dat)))
#taxonomy <- observation_metadata(dat)
#crohn.tree <- read.tree("rawData/insertion_tree.relabelled.tre")
# Count table of the sputum samples, transposed and converted to a data frame
# (original note: samples become rows, OTUs become columns).
otu_table_col <- phyloseq$phyloseq_count %>%
  subset_samples(sample_type %in% c("Sputum")) %>%
  otu_table() %>%
  t() %>%
  as.data.frame()
# Keep only the columns that are non-zero in at least one row.
taxa_detected <- colSums(otu_table_col > 0) > 0
otu_table_col <- otu_table_col[, taxa_detected]
# Sample metadata of the same sputum subset, as a plain data frame.
metadata <- phyloseq$phyloseq_count %>%
  subset_samples(sample_type %in% c("Sputum")) %>%
  sample_data() %>%
  data.frame()
# Template step kept for reference (not run): dichotomous diagnosis variable.
#metadata <- cbind(metadata,ifelse(metadata$diagnosis=="CD" | metadata$diagnosis=="IC" | metadata$diagnosis=="UC", 1, 0))
#colnames(metadata)[61] <- "disease"
# Order the metadata rows by where each baylor_other_id sits among the row
# names of the count table.
otu_row_position <- match(metadata$baylor_other_id, rownames(otu_table_col))
meta.sorted <- metadata[order(otu_row_position), ]
# Template filtering steps kept for reference (not run):
# remove duplicates at the end
#meta.site.dup <- subset(meta.sorted, collection=="RISK")
#meta.sub.dup <- subset(meta.site.dup, type_sample=="biopsy" & biopsy_location=="Rectum")
# want subset with only biopsy samples, no follow-up measurements, no missing values in key covariates
#no.follow <- meta.sorted[!duplicated(meta.sorted$anonymized_name),]
#meta.site <- subset(no.follow, collection=="RISK")
#meta.sub <- subset(meta.site, type_sample=="biopsy" & biopsy_location=="Rectum")
# Keep the count-table rows whose row name occurs among the metadata IDs.
id_in_metadata <- rownames(otu_table_col) %in% metadata$baylor_other_id
otu.sub <- otu_table_col[id_in_metadata, , drop = FALSE]
### Prevalence filtering and decontam are not repeated here because the
### phyloseq object was already filtered.
pa.table <- 1 * (otu.sub > 0) # presence/absence table
# Disabled prevalence filter from the template:
#otu.sub.f <- otu.sub[, -which(colSums(pa.table)<5)] #Prevalence filtering
#dim(otu.sub)
#dim(otu.sub.f) # 16 taxa were removed
# Retain only rows whose total read count exceeds 5000.
enough_reads <- rowSums(otu.sub) > 5000
otu.full <- otu.sub[enough_reads, ]
# Metadata rows belonging to the retained samples.
kept_in_meta <- meta.sorted$baylor_other_id %in% rownames(otu.full)
meta.sub <- meta.sorted[kept_in_meta, , drop = FALSE]
# Dimensions of the count table before the >5000-read filter.
dim(otu.sub)
## [1] 30 165
dim(otu.full) # after the >5000-read filter; same dimensions as otu.sub above, so no sample was removed in this run
## [1] 30 165
# Smallest per-row read total among the retained samples.
min(rowSums(otu.full))
## [1] 40700
# Metadata rows for the samples retained in otu.full.
meta.full <- subset(meta.sub, meta.sub$baylor_other_id %in% rownames(otu.full))
# Library-size comparison between the `control` groups (Welch t-test and
# Wilcoxon test). Kept from the template; the author notes it is not needed
# for this analysis.
# NOTE(review): the formulas pair rowSums(otu.full) with meta.full$control by
# position, so both objects must be in the same row order - confirm.
p.t.full <- t.test(rowSums(otu.full) ~ meta.full$control, var.equal = FALSE)$p.value
p.wilcox.full <- wilcox.test(rowSums(otu.full) ~ meta.full$control)$p.value
# --------------------------
# a random subsample of 10 samples
# --------------------------
set.seed(seed)
otu.sam10 <- otu.full[sort(sample(nrow(otu.full), 10)), ]
# Match on baylor_other_id, the key compared against the count-table row names
# everywhere else in this section (this line previously used sample_name).
meta.sam10 <- subset(meta.sub, meta.sub$baylor_other_id %in% rownames(otu.sam10))
# First pass: analyse all samples, non-rarefied.
########################
# prepare data for analysis
########################
# data_full == 1 selects the full tables; any other value selects the subsample.
otu.tab <- if (data_full == 1) otu.full else otu.sam10
meta.data <- if (data_full == 1) meta.full else meta.sam10
# Dimensions of the count table selected for analysis.
dim(otu.tab)
## [1] 30 165
# Dimensions of the matching metadata; the row count equals that of otu.tab in the output.
dim(meta.data)
## [1] 30 66
########################
# permanova-F
########################
# LDM-F: averaging the F statistics over all rarefactions.
if (data_full == 1) {
  formula <- otu.tab ~ treatment
} else {
  # NOTE(review): the data were subset to sample_type "Sputum" only, so
  # sample_type is presumably constant here - confirm this term is intended.
  formula <- otu.tab ~ sample_type + treatment
}
# Pass meta.data, the metadata chosen together with otu.tab by `data_full`.
# The call previously used meta.sub, which matches otu.tab only for the full
# data set and would not match the subsample.
# NOTE(review): cluster.id is given as the string "subject_id"; confirm against
# the LDM documentation that a column name (not the variable) is accepted.
res.F <- permanovaFL(
  formula = formula,
  data = meta.data, dist.method = dist_method,
  scale.otu.table = FALSE, n.rarefy = 100, n.rej.stop = 100, seed = seed,
  perm.between.type = "none", perm.within.type = "free",
  cluster.id = "subject_id"
)
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
# Original note: permutation within clusters requires clusters of the same size; because some samples failed sequencing, this option was considered not valid.
# Original note: subject_id was added as a fixed term.
# NOTE(review): the permanovaFL call that produced res.F uses perm.within.type = "free" with cluster.id = "subject_id", and its formula has no subject_id term - confirm whether the two notes above are still current.
# Extract the p.permanova element of the permanova-F result.
p_permanovaF <- res.F$p.permanova
# -----------------------------
# dist for permanova-D2 (1-100)
# -----------------------------
# Fills dist_sq_all so that slice r holds the running SUM of squared distance
# matrices over rarefactions 1..r; it is divided by r where it is used.
n_rarefy <- 100
label <- colnames(otu.full)
n_sam_new <- nrow(otu.tab)
dist_sq_all <- array(0, dim = c(n_sam_new, n_sam_new, n_rarefy))
# Only codes 5 (Jaccard) and 4 (unweighted UniFrac) have a distance branch.
# Without this check any other code reused whatever `dist` held before (a
# matrix from an earlier loop, or the stats::dist function).
if (!dist_type %in% c(4, 5)) {
  stop(
    "dist_type ", dist_type, " has no rarefied-distance implementation ",
    "(supported: 4 = unwt-unifrac, 5 = jaccard)",
    call. = FALSE
  )
}
# The UniFrac branch needs a tree; its loading code is commented out above.
if (dist_type == 4 && !exists("crohn.tree")) {
  stop("dist_type 4 requires the tree object `crohn.tree`, which is not loaded",
       call. = FALSE)
}
set.seed(seed)
for (r in seq_len(n_rarefy)) {
  otu_table_rarefy <- Rarefy(otu.tab)$otu.tab.rff
  # Presence/absence version of the rarefied table.
  otu_table_rarefy1 <- (otu_table_rarefy > 0) * 1
  colnames(otu_table_rarefy1) <- label
  colnames(otu_table_rarefy) <- label
  if (dist_type == 5) {
    dist_mat <- vegdist(x = otu_table_rarefy1, method = dist_method)
  } else {
    dist_mat <- GUniFrac(otu_table_rarefy1, crohn.tree, alpha = c(1))$unifrac[, , "d_UW"]
  }
  dist_mat <- as.matrix(dist_mat)
  if (r == 1) {
    dist_sq_all[, , 1] <- dist_mat^2
  } else {
    dist_sq_all[, , r] <- dist_sq_all[, , r - 1] + dist_mat^2
  }
}
# --------------------------
# dist for permanova-D2-A
# --------------------------
# LDM-A: averaging the RSS terms over all rarefactions.
# Stores the two squared-distance results (o1 and o2) returned by the
# *.mean helper for the chosen distance. An unsupported code now stops the
# run instead of leaving dist_sq_inf_o1 / dist_sq_inf_o2 unset or stale.
if (dist_type == 5) {
  res <- jaccard.mean(otu.tab)
  dist_sq_inf_o1 <- res$jac.mean.sq.o1
  dist_sq_inf_o2 <- res$jac.mean.sq.o2
} else if (dist_type == 4) {
  # NOTE(review): crohn.tree is only created by commented-out template code.
  res <- unifrac.mean(otu.tab, crohn.tree)
  dist_sq_inf_o1 <- res$unifrac.mean.sq.o1
  dist_sq_inf_o2 <- res$unifrac.mean.sq.o2
} else {
  stop(
    "dist_type ", dist_type, " has no mean-distance implementation ",
    "(supported: 4 = unwt-unifrac, 5 = jaccard)",
    call. = FALSE
  )
}
# Author's note: negative controls had 0 reads after rarefying. These all-zero
# rows caused problems in data processing, so the negative controls were
# removed from the rarefaction analysis.
#########################
# permanova-D2, D2-A
#########################
# The formula names dist_sq_r, which is reassigned on every pass of the loop
# below before permanovaFL is called.
formula <- if (data_full == 1) {
  dist_sq_r ~ treatment
} else {
  # NOTE(review): `sex` and `disease` come from the template; the line creating
  # `disease` is commented out earlier in this section - confirm before use.
  dist_sq_r | sex ~ disease
}
p_permanovaD2 <- NULL
p_permanovaD2A2 <- NULL
p_permanovaD2A1 <- NULL
# r = 1..10, 50, 100 use the mean of the first r rarefied squared distances.
# The sentinel values 100^2 and 100^3 select D2-A2 and D2-A1 respectively.
for (r in c(1:10, 50, 100, 100^2, 100^3)) {
  cat("number of rarefaction:", r, "\n")
  is_a2 <- r == 100^2
  is_a1 <- r == 100^3
  dist_sq_r <- if (is_a2) {
    dist_sq_inf_o2
  } else if (is_a1) {
    dist_sq_inf_o1
  } else {
    dist_sq_all[, , r] / r
  }
  # square.dist = FALSE: the matrices passed in are already squared.
  res_permanovaD2 <- permanovaFL(
    formula = formula,
    data = meta.data, n.perm.max = 5000,
    square.dist = FALSE, n.rarefy = 0, n.rej.stop = 100, seed = seed,
    perm.between.type = "none", perm.within.type = "free",
    cluster.id = "subject_id"
  )
  p_now <- res_permanovaD2$p.permanova
  if (is_a2) {
    p_permanovaD2A2 <- p_now
  } else if (is_a1) {
    p_permanovaD2A1 <- p_now
  } else {
    p_permanovaD2 <- c(p_permanovaD2, p_now)
  }
}
## number of rarefaction: 1
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 2
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 3
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 4
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 5
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 6
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 7
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 8
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 9
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 10
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 50
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 100
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 10000
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
## number of rarefaction: 1e+06
## permutations: 100
## permutations: 200
## permutations: 300
## permutations: 400
## permutations: 500
## permutations: 600
## permutations: 700
## permutations: 800
## permutations: 900
## permutations: 1000
## permutations: 1100
## permutations: 1200
## permutations: 1300
## permutations: 1400
## permutations: 1500
## permutations: 1600
## permutations: 1700
## permutations: 1800
## permutations: 1900
## permutations: 2000
## permutations: 2100
## permutations: 2200
## permutations: 2300
## permutations: 2400
## permutations: 2500
## permutations: 2600
## permutations: 2700
## permutations: 2800
## permutations: 2900
## permutations: 3000
## permutations: 3100
## permutations: 3200
## permutations: 3300
## permutations: 3400
## permutations: 3500
## permutations: 3600
## permutations: 3700
## permutations: 3800
## permutations: 3900
## permutations: 4000
## permutations: 4100
## permutations: 4200
## permutations: 4300
## permutations: 4400
## permutations: 4500
## permutations: 4600
## permutations: 4700
## permutations: 4800
## permutations: 4900
## permutations: 5000
# Disabled template step: append all p-values plus the seed to a text file.
#tab = c(p_permanovaF, p_permanovaD2, p_permanovaD2A2, p_permanovaD2A1, i_seed)
#write.table(t(tab), file = paste("", data_full, "_dist", dist_type, ".txt", sep=""),
# quote=FALSE, append=TRUE, row.names=FALSE, col.names=FALSE)
# Print the permanova-F p-values (labelled rarefy1..rarefy100 in the output).
p_permanovaF
## rarefy1 rarefy2 rarefy3 rarefy4 rarefy5 rarefy6 rarefy7
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy8 rarefy9 rarefy10 rarefy11 rarefy12 rarefy13 rarefy14
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy15 rarefy16 rarefy17 rarefy18 rarefy19 rarefy20 rarefy21
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy22 rarefy23 rarefy24 rarefy25 rarefy26 rarefy27 rarefy28
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy29 rarefy30 rarefy31 rarefy32 rarefy33 rarefy34 rarefy35
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy36 rarefy37 rarefy38 rarefy39 rarefy40 rarefy41 rarefy42
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy43 rarefy44 rarefy45 rarefy46 rarefy47 rarefy48 rarefy49
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy50 rarefy51 rarefy52 rarefy53 rarefy54 rarefy55 rarefy56
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy57 rarefy58 rarefy59 rarefy60 rarefy61 rarefy62 rarefy63
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy64 rarefy65 rarefy66 rarefy67 rarefy68 rarefy69 rarefy70
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy71 rarefy72 rarefy73 rarefy74 rarefy75 rarefy76 rarefy77
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy78 rarefy79 rarefy80 rarefy81 rarefy82 rarefy83 rarefy84
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy85 rarefy86 rarefy87 rarefy88 rarefy89 rarefy90 rarefy91
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy92 rarefy93 rarefy94 rarefy95 rarefy96 rarefy97 rarefy98
## 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## rarefy99 rarefy100
## 0.00019996 0.00019996
# p-values collected in the loop for r = 1..10, 50 and 100 (12 values).
p_permanovaD2
## [1] 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
## [7] 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996 0.00019996
# p-value from the loop's 100^2 case (dist_sq_inf_o2, D2-A2).
p_permanovaD2A2
## [1] 0.00019996
# p-value from the loop's 100^3 case (dist_sq_inf_o1, D2-A1).
p_permanovaD2A1
## [1] 0.00019996
# res_permanovaD2 is overwritten on every loop pass, so this R.squared belongs to the final iteration only (r = 100^3, D2-A1).
res_permanovaD2$R.squared
## [1] 0.3931708
Results
4.1. Host depletion lowered host DNA % and increased final reads & species richness
4.2. The increase in species richness was mainly due to increased final reads, not to a direct effect of treatment.
4.3. The effect of each treatment on beta diversity was sample-type specific.
4.4. Molysis (BAL), QIAamp (nasal swab), and Host zero (sputum) were selected as the best methods for depleting host DNA in each sample type.
Results summary
Summary table
# BAL summary: one row per depletion method, three character columns.
# check.names = FALSE keeps the column names with spaces as written.
summary_bal <- data.frame(
  Issues = c("High host%", "High host%", "-", "-", "High host%"),
  `Species richness` = c("+0.8", "+6.4", "+9.2", "+18.8", "+10.0"),
  `Beta diversity change` = c("LM", "-", "LM", "-", "LM"),
  row.names = c("lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# Nasal swab summary: one row per depletion method, three character columns.
# "<br>" inside a cell is literal HTML markup carried through to the table.
summary_ns <- data.frame(
  Issues = c(
    "High host %<br>gram-stain %", "High host%", "-",
    "Library fail", "Library fail"
  ),
  `Species richness` = c("-4.2", "-0.8", "+8.8", "+5.4", "+7.4"),
  `Beta diversity change` = c("PERM<br>LM", "-", "-", "-", "-"),
  row.names = c("lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# Sputum summary: one row per depletion method, three character columns.
summary_spt <- data.frame(
  Issues = c(
    "Gram-stain %", "Gram-stain %", "Gram-stain %",
    "Gram-stain %", "Gram-stain %"
  ),
  `Species richness` = c("+34.8", "+63.0", "+96.0", "+104.6", "+79.4"),
  `Beta diversity change` = c("-", "LM", "LM", "PERM<br>LM", "PERM<br>LM"),
  row.names = c("lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# Combine the three per-sample-type summaries side by side and render them as
# an HTML table with a grouping header row above the column names.
# escape and full_width are logical switches, so pass FALSE rather than 0.
# escape = FALSE leaves the "<br>" markup in the cells unescaped.
table3 <- cbind(summary_bal, summary_ns, summary_spt) %>%
  kbl(format = "html", escape = FALSE) %>%
  add_header_above(c(" " = 1, "BAL" = 3, "Nasal swab" = 3, "Sputum" = 3)) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
table3
| | Issues (BAL) | Species richness (BAL) | Beta diversity change (BAL) | Issues (Nasal swab) | Species richness (Nasal swab) | Beta diversity change (Nasal swab) | Issues (Sputum) | Species richness (Sputum) | Beta diversity change (Sputum) |
|---|---|---|---|---|---|---|---|---|---|
| lyPMA | High host% | +0.8 | LM | High host %<br>gram-stain % | -4.2 | PERM<br>LM | Gram-stain % | +34.8 | - |
| Benzonase | High host% | +6.4 | - | High host% | -0.8 | - | Gram-stain % | +63.0 | LM |
| Host zero | - | +9.2 | LM | - | +8.8 | - | Gram-stain % | +96.0 | LM |
| Molysis | - | +18.8 | - | Library fail | +5.4 | - | Gram-stain % | +104.6 | PERM<br>LM |
| QIAamp | High host% | +10.0 | LM | Library fail | +7.4 | - | Gram-stain % | +79.4 | PERM<br>LM |
# Write table3 to a self-contained HTML file. TRUE is spelled out because `T`
# is an ordinary variable that can be reassigned.
# NOTE(review): the output location is a hard-coded absolute path on the author's machine.
save_kable(table3, file = "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/7_Manuscripts/2022_MGK_Host_Depletion/Figures/table3.html", self_contained = TRUE)
Done.
Bibliography
#===============================================================================
#BTC.LineZero.Footer.1.1.0
#===============================================================================
#R markdown citation generator.
#===============================================================================
#RLB.Dependencies:
# magrittr, pacman, stringr
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#BTC.Dependencies:
# LineZero.Header
#===============================================================================
#Generates citations for each explicitly loaded library.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Cite R itself first, then every requested library exactly once. unique()
# drops packages that are listed more than once in the header and keeps this
# statement idempotent when the chunk is re-run.
str_libraries <- unique(c("r", str_libraries))
# A dedicated loop variable is used so that iterating does not overwrite
# str_libraries with its own last element.
for (str_library in str_libraries) {
  str_library |>
    pacman::p_citation() |>
    print(bibtex = FALSE) |>
    capture.output() %>%
    # Drop the first three captured lines and any blank lines, keeping only
    # the citation text itself.
    .[-1:-3] %>%
    .[. != ""] |>
    stringr::str_squish() |>
    # Strip every "_" emphasis delimiter (e.g. "_Title_."), not just the first
    # one on each line. Underscores with a letter or digit on both sides (as
    # found inside URLs or identifiers) are left untouched.
    stringr::str_replace_all("(?<![A-Za-z0-9])_|_(?![A-Za-z0-9])", "") |>
    cat()
  cat("\n")
}
## R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Wickham H, Bryan J (2023). readxl: Read Excel Files_. R package version 1.4.2, <https://CRAN.R-project.org/package=readxl>.
## phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data. Paul J. McMurdie and Susan Holmes (2013) PLoS ONE 8(4):e61217.
## Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R, Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## Rinker, T. W. & Kurkiewicz, D. (2017). pacman: Package Management for R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## Garbett SP, Stephens J, Simonov K, Xie Y, Dong Z, Wickham H, Horner J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN (2023). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.7, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
## Oksanen J, Simpson G, Blanchet F, Kindt R, Legendre P, Minchin P, O'Hara R, Solymos P, Stevens M, Szoecs E, Wagner H, Barbour M, Bedward M, Bolker B, Borcard D, Carvalho G, Chirico M, De Caceres M, Durand S, Evangelista H, FitzJohn R, Friendly M, Furneaux B, Hannigan G, Hill M, Lahti L, McGlinn D, Ouellette M, Ribeiro Cunha E, Smith T, Stier A, Ter Braak C, Weedon J (2022). vegan: Community Ecology Package. R package version 2.6-4, <https://CRAN.R-project.org/package=vegan>.
## Leo Lahti et al. microbiome R package. URL: http://microbiome.github.io
## Kassambara A (2023). ggpubr: 'ggplot2' Based Publication Ready Plots. R package version 0.6.0, <https://CRAN.R-project.org/package=ggpubr>.
## Simon Garnier, Noam Ross, Robert Rudis, Antônio P. Camargo, Marco Sciaini, and Cédric Scherer (2021). Rvision - Colorblind-Friendly Color Maps for R. R package version 0.6.2.
## Davis NM, Proctor D, Holmes SP, Relman DA, Callahan BJ (2017). "Simple statistical identification and removal of contaminant sequences in marker-gene and metagenomics data." bioRxiv_, 221499. doi:10.1101/221499 <https://doi.org/10.1101/221499>.
## Auguie B (2017). gridExtra: Miscellaneous Functions for "Grid" Graphics. R package version 2.3, <https://CRAN.R-project.org/package=gridExtra>.
## Kassambara A (2023). ggpubr: 'ggplot2' Based Publication Ready Plots. R package version 0.6.0, <https://CRAN.R-project.org/package=ggpubr>.
## Douglas Bates, Martin Maechler, Ben Bolker, Steve Walker (2015). Fitting Linear Mixed-Effects Models Using lme4. Journal of Statistical Software, 67(1), 1-48. doi:10.18637/jss.v067.i01.
## Kuznetsova A, Brockhoff PB, Christensen RHB (2017). "lmerTest Package: Tests in Linear Mixed Effects Models." Journal of Statistical Software, *82*(13), 1-26. doi:10.18637/jss.v082.i13 <https://doi.org/10.18637/jss.v082.i13>.
## Ooms J (2023). writexl: Export Data Frames to Excel 'xlsx' Format_. R package version 1.4.2, <https://CRAN.R-project.org/package=writexl>.
## Gonçalves da Silva A (2017). harrietr: Wrangle Phylogenetic Distance Matrices and Other Utilities. R package version 0.2.3, <https://CRAN.R-project.org/package=harrietr>.
## Mallick H et al. (2020). Multivariable Association in Population-scale Meta-omics Studies, http://huttenhower.sph.harvard.edu/maaslin2. To cite the MaAsLin 2 software, please use: Mallick H, Rahnavard A, McIver LJ (2020). MaAsLin 2: Multivariable Association in Population-scale Meta-omics Studies. R/Bioconductor package, http://huttenhower.sph.harvard.edu/maaslin2.
## Wilke C, Wiernik B (2022). ggtext: Improved Text Rendering Support for 'ggplot2'. R package version 0.1.2, <https://CRAN.R-project.org/package=ggtext>.
## Aphalo P (2022). ggpmisc: Miscellaneous Extensions to 'ggplot2'_. R package version 0.5.2, <https://CRAN.R-project.org/package=ggpmisc>.
## Auguie B (2017). gridExtra: Miscellaneous Functions for "Grid" Graphics. R package version 2.3, <https://CRAN.R-project.org/package=gridExtra>.
## Wood S, Scheipl F (2020). gamm4: Generalized Additive Mixed Models using 'mgcv' and 'lme4'. R package version 0.2-6, <https://CRAN.R-project.org/package=gamm4>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Hadley Wickham (2007). Reshaping Data with the reshape Package. Journal of Statistical Software, 21(12), 1-20. URL http://www.jstatsoft.org/v21/i12/.
## Zhu H (2021). kableExtra: Construct Complex Table with 'kable' and Pipe Syntax. R package version 1.3.4, <https://CRAN.R-project.org/package=kableExtra>.
## Yihui Xie (2023). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.42. Yihui Xie (2015) Dynamic Documents with R and knitr. 2nd edition. Chapman and Hall/CRC. ISBN 978-1498716963 Yihui Xie (2014) knitr: A Comprehensive Tool for Reproducible Research in R. In Victoria Stodden, Friedrich Leisch and Roger D. Peng, editors, Implementing Reproducible Computational Research. Chapman and Hall/CRC. ISBN 978-1466561595
## Guangchuang Yu. (2022). Data Integration, Manipulation and Visualization of Phylogenetic Trees (1st edition). Chapman and Hall/CRC. Shuangbin Xu, Lin Li, Xiao Luo, Meijun Chen, Wenli Tang, Li Zhan, Zehan Dai, Tommy T. Lam, Yi Guan, Guangchuang Yu. Ggtree: A serialized data object for visualization of a phylogenetic tree and annotation data. iMeta 2022, 4(1):e56. doi:10.1002/imt2.56 Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics, 2020, 69:e96. doi: 10.1002/cpbi.96 Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(2):3041-3043. doi: 10.1093/molbev/msy194 Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## John Fox and Sanford Weisberg (2019). An {R} Companion to Applied Regression, Third Edition. Thousand Oaks CA: Sage. URL: https://socialsciences.mcmaster.ca/jfox/Books/Companion/
## Hu Y, Satten GA (2023). LDM: Testing Hypotheses about the Microbiome using the Linear Decomposition Model. R package version 6.0. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Dustin Tingley, Teppei Yamamoto, Kentaro Hirose, Luke Keele, Kosuke Imai (2014). mediation: R Package for Causal Mediation Analysis. Journal of Statistical Software, 59(5), 1-38. URL http://www.jstatsoft.org/v59/i05/. For the underlying methods please cite the following papers: Kosuke Imai, Luke Keele and Teppei Yamamoto (2010). Identification, Inference and Sensitivity Analysis for Causal Mediation Effects. Statistical Science, 25(1), 51-71. Kosuke Imai, Luke Keele and Dustin Tingley (2010). A General Approach to Causal Mediation Analysis. Psychological Methods, 15(4), 309-334. Kosuke Imai, Luke Keele, Dustin Tingley and Teppei Yamamoto (2011). Unpacking the Black Box of Causality: Learning about Causal Mechanisms from Experimental and Observational Studies. American Political Science Review, 105(4), 765-789. Kosuke Imai and Teppei Yamamoto (2013). Identification and Sensitivity Analysis for Multiple Causal Mechanisms: Revisiting Evidence from Framing Experiments. Political Analysis, 21(2), 141-171. Kosuke Imai, Luke Keele, Dustin Tingley and Teppei Yamamoto (2010). Causal Mediation Analysis Using R. In Advances in Social Science Research Using R, ed. H. D. Vinod, New York: Springer-Verlag.
#===============================================================================